# Building the Final Model

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
%matplotlib inline
warnings.filterwarnings('ignore')

In [2]:
final_model = pd.read_csv('reddit.csv')
final_model.head()

Unnamed: 0.1,Unnamed: 0,Title,Upvote_ratio,Score,Gilded,Over_18,Number_of_Comments
0,0,House impeaches Trump for second time over Cap...,0.72,39464,2,False,8801
1,1,I'm a 16 yr old blacksmith and I forged this w...,0.9,29423,0,False,443
2,2,Do some research before spouting nonsense,0.9,50420,0,False,1008
3,3,Moving boxes arrive at the White House,0.81,149858,6,False,4392
4,4,“No One Took Us Seriously”: Black Cops Warned ...,0.98,7768,0,False,157


In [3]:
final_model.shape

(7471, 7)

In [4]:
#Dropping the unnamed column
final_model.drop('Unnamed: 0', axis = 1, inplace = True)

In [5]:
#Dropping duplicate columns

final_model.drop_duplicates(keep = 'first', inplace = True)

In [6]:
final_model.columns

Index(['Title', 'Upvote_ratio', 'Score', 'Gilded', 'Over_18',
       'Number_of_Comments'],
      dtype='object')

In [7]:
#NUMBER OF WORDS IN THE TITLE

import re
def word_count(text):
    wordList = re.sub("[^\w]", " ",  text).split()
    return len(wordList)

final_model['word_count'] = final_model['Title'].map(lambda x : word_count(x))

## Cleaning the title column

In [8]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [9]:
#Downloading stopwords from NLTK
#We don't remove No and Not as it will help in sentiment analysis
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stop_words.remove("not")
stop_words.remove("no")

<IPython.core.display.Javascript object>

[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
#Enforcing proper spacing

def spaceURLs(df, column):
    '''This method is used to enforce proper spacing
    Ex: In the data, you may have '[the image](https://image.xyz)';
    this method creates space between alt text ("the image") and the URL.'''
    data = df.copy()
    data.loc[:, column] = data[column].str.replace('\[|\]', ' ', regex=True)
    return data

In [11]:
#Replace URLs

def replaceURL(df, column):
    data = df.copy()
    data.loc[:, column] = data[column].str.replace('\(http\S+', 'URL', regex=True)
    return data

In [12]:
#Removing symbols

def removeSymbols(df, column):
    data = df.copy()
    data.loc[:, column] = data[column].str.replace('/r/', '', regex=True)
    data.loc[:, column] = data[column].str.replace('[^\.\'A-Za-z0-9]+', ' ', regex=True)
    return data

In [13]:
#Removing numbers

def removeNumbers(df, column):
    data = df.copy()
    data.loc[:, column] = data[column].str.replace('\S*\d\S*', '', regex=True)
    return data

In [14]:
#Remove stop words

stop_words = stopwords.words('english')
stop_words.extend(["ca","http"])
def no_stop_words(list_tokens):    
    tokens_no_stop_word = []
    for elt in list_tokens: 
        tokens_no_stop_word.append([tok for tok in elt if tok not in stop_words])
    return(tokens_no_stop_word)

In [15]:
data = spaceURLs(final_model, 'Title')
data = replaceURL(final_model, 'Title')
data = removeNumbers(final_model, 'Title')
data = removeSymbols(final_model, 'Title')

In [16]:
final_model = data.copy()

In [17]:
final_model.head(10).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Title,House impeaches Trump for second time over Cap...,I'm a 16 yr old blacksmith and I forged this w...,Do some research before spouting nonsense,Moving boxes arrive at the White House,No One Took Us Seriously Black Cops Warned Ab...,US bans all cotton and tomato products from Xi...,Urban explorer of Reddit what was your I bette...,LPT As a lawyer don't take legal advice from R...,Pirate Bay Founder Thinks Parler s Inability t...,GME YOLO update Jan 13 2021
Upvote_ratio,0.72,0.9,0.9,0.81,0.98,0.88,0.93,0.94,0.88,0.89
Score,39464,29423,50420,149858,7768,112303,30184,32876,79607,37315
Gilded,2,0,0,6,0,0,0,0,2,17
Over_18,False,False,False,False,False,False,True,False,False,False
Number_of_Comments,8801,443,1008,4392,157,4072,3666,1766,3338,3016
word_count,9,18,6,7,15,12,11,11,12,6


# Sentiment Analysis

In [18]:
from textblob import TextBlob

In [19]:
# Create a function to get the subjectivity
def Subjectivity(text):
       return TextBlob(text).sentiment.subjectivity
    
# Create a function to get the polarity
def Polarity(text):
      return  TextBlob(text).sentiment.polarity

In [20]:
final_model['Subjectivity'] = final_model['Title'].apply(Subjectivity)
final_model['Polarity'] = final_model['Title'].apply(Polarity)

In [21]:
final_model.head()

Unnamed: 0,Title,Upvote_ratio,Score,Gilded,Over_18,Number_of_Comments,word_count,Subjectivity,Polarity
0,House impeaches Trump for second time over Cap...,0.72,39464,2,False,8801,9,0.0,0.0
1,I'm a 16 yr old blacksmith and I forged this w...,0.9,29423,0,False,443,18,0.2,0.1
2,Do some research before spouting nonsense,0.9,50420,0,False,1008,6,0.0,0.0
3,Moving boxes arrive at the White House,0.81,149858,6,False,4392,7,0.0,0.0
4,No One Took Us Seriously Black Cops Warned Ab...,0.98,7768,0,False,157,15,0.433333,-0.166667


In [22]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [23]:
senti = SentimentIntensityAnalyzer()
for sentence in final_model['Title']:
    ss = senti.polarity_scores(sentence)

In [24]:
import pickle

In [25]:
pickle.dump(senti, open('senti.pkl','wb'))

In [26]:
#analyzer = SentimentIntensityAnalyzer()
final_model['rating'] = final_model['Title'].apply(senti.polarity_scores)

In [27]:
final_model = pd.concat([final_model.drop(['rating'], axis=1), final_model['rating'].apply(pd.Series)], axis=1)

In [28]:
# data preprocessing
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import re
import nltk
from nltk.stem import WordNetLemmatizer,PorterStemmer
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def text_preprocess(text):
    text = re.sub(r'[^\w\s]', '', text) 
    l_text = [word for word in text.lower().split() if word not in ENGLISH_STOP_WORDS]
    stem_words = [stemmer.stem(w) for w in l_text]
    lemma_words = [lemmatizer.lemmatize(w) for w in l_text]

    return " ".join(lemma_words)

final_model['Title'] = final_model['Title'].map(lambda x : text_preprocess(x))

In [29]:
final_model.reset_index(inplace=True, drop=True)

# Using the entire dataset to train the model

# Splitting the dataset into features and dependent variable

In [30]:
X = final_model.drop(['Score'], axis = 1)
y = final_model.Score

## One Hot Encoding the over_18 column

In [31]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(handle_unknown='ignore')
ohe.fit(X[['Over_18']])
pickle.dump(ohe, open('encoding.pkl','wb'))
col_names = [j for sub in ohe.categories_ for j in sub]

In [32]:
col_names

[False, True]

In [33]:
#Transforming the encoding

trans = ohe.transform(X[['Over_18']])

In [34]:
encoded = pd.DataFrame(trans.todense(), columns=col_names)

**Converting title column to vector**

In [35]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X['Title'])

Using TensorFlow backend.


In [36]:
word_ind = tokenizer.word_index
word_token = {k:[word_ind[k]] for k in word_ind}
df_word_token = pd.DataFrame(word_token)
df_word_token.to_csv('word_token_final.csv', header=True, index=False)

In [37]:
glove_txt = open('glove.6B.100d.txt', encoding="utf8")
embed_dict = dict()
for line in glove_txt:
    records = line.split()
    word = records[0]
    vector_dimensions = np.asarray(records[1:], dtype='float32')
    embed_dict[word] = vector_dimensions
glove_txt.close()

embed_mat_final = np.zeros((len(word_ind) + 1, 100))
for word, index in tokenizer.word_index.items():
    embedding_vector = embed_dict.get(word)
    if embedding_vector is not None:
        embed_mat_final[index] = embedding_vector

In [38]:
embedded_matrix = pd.DataFrame(data=embed_mat_final.astype(float))
embedded_matrix.to_csv('embedded_final.csv', sep = ' ', header =True, float_format='%.2f', index=False)

In [39]:
from keras.preprocessing.sequence import pad_sequences
max_len = 300

final_title = pad_sequences(tokenizer.texts_to_sequences(X["Title"]), padding='post', maxlen=max_len)
train = {}
final_train_title = []
for i, sentence in enumerate(final_title):
    vectors = []
    for n in sentence:
        vectors.append(embed_mat_final[n])
    train[i] = vectors
for key in train:
    arr = np.array(train[key])
    final_train_title.append(np.mean(arr, axis=0))

In [40]:
final_train_title = pd.DataFrame(np.array(final_train_title))
final_train_title.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.002113,-0.004449,0.007958,-0.001411,-0.001737,0.011883,-0.003144,0.003628,0.000127,0.002026,...,0.006714,-0.006593,-0.004499,0.004678,-0.009925,-0.002041,0.002959,0.003787,0.011562,-0.005068
1,0.004027,0.010706,0.003907,-0.013489,-0.001871,7.8e-05,0.008044,0.000476,-0.008465,-0.00502,...,-0.001516,0.007992,0.008821,0.003444,-0.009619,-0.003693,-0.003268,-0.005906,0.004614,-0.003194
2,-0.000314,0.003569,0.00201,-0.002045,0.00014,0.000917,0.003223,-0.003422,-0.004678,0.002914,...,0.00102,0.000139,0.000546,-0.000601,0.003563,0.001986,-0.005421,-0.007119,0.002071,0.007355
3,-0.004728,0.001405,0.007781,-0.002336,0.001861,0.005995,-0.001082,0.005549,-0.000758,-0.002288,...,-0.000519,-0.001332,0.003333,0.002879,-0.01217,-0.003115,-0.002669,0.003307,0.011029,-0.004028
4,0.004433,-0.005766,0.005804,-0.017402,-0.002426,0.014638,-0.002381,0.006815,-0.000434,0.011609,...,0.005386,-0.014872,-0.008367,0.003606,-0.008192,0.0097,-0.006349,0.001086,0.004418,-0.000856


In [41]:
X.columns

Index(['Title', 'Upvote_ratio', 'Gilded', 'Over_18', 'Number_of_Comments',
       'word_count', 'Subjectivity', 'Polarity', 'neg', 'neu', 'pos',
       'compound'],
      dtype='object')

In [42]:
X.reset_index(inplace=True, drop=True)

In [43]:
X = pd.concat([X, final_train_title, encoded], axis=1)

In [44]:
X.columns

Index([             'Title',       'Upvote_ratio',             'Gilded',
                  'Over_18', 'Number_of_Comments',         'word_count',
             'Subjectivity',           'Polarity',                'neg',
                      'neu',
       ...
                         92,                   93,                   94,
                         95,                   96,                   97,
                         98,                   99,                False,
                       True],
      dtype='object', length=114)

In [45]:
X.drop(['Over_18', 'Title'], axis = 1, inplace = True)

In [46]:
import numpy.ma as ma
np.where(np.isnan(X), ma.array(X, mask=np.isnan(X)).mean(axis=0), X)

array([[ 7.20000000e-01,  2.00000000e+00,  8.80100000e+03, ...,
        -5.06847673e-03,  1.00000000e+00,  0.00000000e+00],
       [ 9.00000000e-01,  0.00000000e+00,  4.43000000e+02, ...,
        -3.19399011e-03,  1.00000000e+00,  0.00000000e+00],
       [ 9.00000000e-01,  0.00000000e+00,  1.00800000e+03, ...,
         7.35459983e-03,  1.00000000e+00,  0.00000000e+00],
       ...,
       [ 9.40000000e-01,  0.00000000e+00,  0.00000000e+00, ...,
        -2.27840001e-03,  1.00000000e+00,  0.00000000e+00],
       [ 9.80000000e-01,  0.00000000e+00,  2.27000000e+02, ...,
         5.14300019e-03,  1.00000000e+00,  0.00000000e+00],
       [ 8.70000000e-01,  0.00000000e+00,  1.00000000e+01, ...,
         3.64199996e-03,  1.00000000e+00,  0.00000000e+00]])

In [47]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7469 entries, 0 to 7468
Columns: 112 entries, Upvote_ratio to True
dtypes: float64(109), int64(3)
memory usage: 6.4 MB


## Modelling

In [48]:
from xgboost import XGBRegressor
xgb = XGBRegressor(random_state = 21)

In [49]:
xgb.fit(X, y)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=21,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [50]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn import metrics
train_pred = xgb.predict(X)
r2 = r2_score(y, y)
mse = mean_squared_error(y, y)
mae = mean_absolute_error(y, y)
print(f"R-Sq of train data: {r2:.4}")
print(f"RMSE of train data: {np.sqrt(mse)}")
print(f"MAE  of train data: {mae}")

R-Sq of train data: 1.0
RMSE of train data: 0.0
MAE  of train data: 0.0


**Creating Booster file to upload XGBRegressor**

In [51]:
from xgboost import Booster
xgb.save_model('xgb.booster')