The nltk library along with the TfidfVectorizer was used to preprocess the data. The titles were tokenized, punctuations etc. with non-alphabetic characters as well as stop words were removed. The data was then lemmatized and the dictionary consisting of only more frequent words was created. This dictionary along with preprocessed data was then used in the TfidfVectorizer to create input X to be used in various models.

In [1]:
import pandas as pd
data = pd.read_csv('redditData1.csv')

In [2]:
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [13]:
word_list = []
for t in data['title']:
    word_list = word_list + word_tokenize(t.lower())
    
from collections import Counter
common_words = list(Counter(word_list).most_common(2500))

In [15]:
word_list = sorted(list(set(word_list)))
word_list = [item for item in word_list if item.isalpha()]

word_list = [x[0] for x in common_words]

stop_words = set(stopwords.words('english'))
word_list_sw = [w for w in word_list if not w in stop_words]

lemmatizer = WordNetLemmatizer()
word_list_lem = [lemmatizer.lemmatize(w) for w in word_list_sw]
word_list_lem = sorted(list(set(word_list_lem)))

In [16]:
len(word_list_lem)

2128

In [17]:
word_list_lem

['!',
 '#',
 '$',
 '%',
 '&',
 "'",
 "''",
 "'corona",
 "'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '--',
 '.',
 '..',
 '...',
 '/',
 '1',
 '1,000',
 '1,500',
 '10',
 '10,000',
 '100',
 '1000',
 '10th',
 '11',
 '12',
 '13',
 '14',
 '14th',
 '15',
 '16',
 '17',
 '18',
 '19',
 '1st',
 '2',
 '2.0',
 '20',
 '20,000',
 '200',
 '2017',
 '2018',
 '2019',
 '2020',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '2yodoindia',
 '3',
 '30',
 '300',
 '31',
 '39',
 '3d',
 '3rd',
 '4',
 '40',
 '400',
 '48',
 '480p',
 '4th',
 '5',
 '5.7',
 '50',
 '50,000',
 '500',
 '54',
 '5th',
 '6',
 '60',
 '7',
 '8',
 '9',
 '9.99',
 '90',
 '93',
 '9pm',
 ':',
 ';',
 '?',
 '@',
 '[',
 ']',
 '``',
 'aap',
 'aarogya',
 'abhijit',
 'able',
 'abp',
 'abroad',
 'absolute',
 'absolutely',
 'abuse',
 'abused',
 'ac',
 'accept',
 'access',
 'according',
 'account',
 'accused',
 'across',
 'act',
 'action',
 'active',
 'activity',
 'actor',
 'actual',
 'actually',
 'ad',
 'addre

In [219]:
with open('dict.pkl', 'wb') as f:
    pickle.dump(word_list_lem, f, protocol=2)

In [38]:
def preprocess(content):
    result = []
    for s in content:
        data = word_tokenize(s.lower())
        stop_words = set(stopwords.words('english'))
        data = [w for w in data if not w in stop_words]
        lemmatizer = WordNetLemmatizer()
        data_lem = [lemmatizer.lemmatize(w) for w in data]
        data_lem = sorted(list(set(data_lem)))
        
        data = ' '.join(word for word in data_lem)
        result.append(data)
    return result

In [46]:
clf = TfidfVectorizer(input = 'content', vocabulary = word_list_lem)
X = clf.fit_transform(preprocess(data['title']))
#X = clf.fit_transform(data['title'])
clf.get_feature_names()
X = X.toarray()

In [40]:
X.shape

(12063, 2128)

A dictionary was used to map the flair labels to numbers.

In [25]:
flairs = data['link_flair_text'].unique()
f_dict = dict(zip(flairs, range(len(flairs))))

In [26]:
f_dict

{'Non-Political': 0,
 'Coronavirus': 1,
 'AskIndia': 2,
 'Photography': 3,
 'Science/Technology': 4,
 'Politics': 5,
 'Policy/Economy': 6,
 'Business/Finance': 7,
 'Sports': 8,
 'Food': 9}

In [31]:
df = data.applymap(lambda s: f_dict.get(s) if s in f_dict else s)

In [52]:
y = np.array(df['link_flair_text'])

In [54]:
y.shape

(12063,)

The data was then split into training and testing sets. SVC and Random Forest was used. However, since the dataset was not balanced, most of the times, the majority class was predicted.

In [55]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [56]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [57]:
model = SVC(kernel='rbf', C=1.0, gamma = 'scale')
    
model.fit(X_train, y_train)
p = model.predict(X_test)

In [58]:
acc = accuracy_score(y_test, p)
cm = confusion_matrix(y_test, p)
print(acc)
print(cm)

0.42470295661785024
[[   0  678    0    0    0    0    0    0    0    0]
 [   0 1537    0    0    0    0    0    0    0    0]
 [   0  508    0    0    0    0    0    0    0    0]
 [   0  103    0    0    0    0    0    0    0    0]
 [   0  133    0    0    0    0    0    0    0    0]
 [   0  382    0    0    0    0    0    0    0    0]
 [   0  111    0    0    0    0    0    0    0    0]
 [   0   95    0    0    0    0    0    0    0    0]
 [   0   35    0    0    0    0    0    0    0    0]
 [   0   37    0    0    0    0    0    0    0    0]]


In [62]:
from sklearn.ensemble import RandomForestClassifier

rf_model=RandomForestClassifier(n_estimators=100)
rf_model.fit(X_train,y_train)
rf_pred=rf_model.predict(X_test)

In [63]:
acc = accuracy_score(y_test, rf_pred)
cm = confusion_matrix(y_test, rf_pred)
print(acc)
print(cm)

0.5473887814313346
[[ 248  275   78   10   10   48    3    2    1    3]
 [ 100 1306   68    7    6   42    4    2    0    2]
 [ 109  179  180    5    7   18    1    6    0    3]
 [  27   20    5   47    0    2    0    0    1    1]
 [  30   50   17    0   26    3    0    6    1    0]
 [  55  172   23    1    0  129    2    0    0    0]
 [  18   55   15    1    1    4   15    2    0    0]
 [  15   40   17    0    3    0    2   17    0    1]
 [   8   17    2    0    0    1    0    0    6    1]
 [   8   15    5    1    0    1    0    0    0    7]]


In order to tackle the imbalance of the data, the class_weight keyword was used.

In [64]:
svc_model = SVC(kernel='rbf', C=1.0, gamma = 'scale', class_weight = 'balanced')
    
svc_model.fit(X_train, y_train)
p = svc_model.predict(X_test)

In [66]:
acc = accuracy_score(y_test, p)
cm = confusion_matrix(y_test, p)
print(acc)
print(cm)

0.2555954683614258
[[ 666    4    1    2    0    0    1    3    0    1]
 [1321  190    1    0    0    9   12    0    1    3]
 [ 476   16    1    5    0    1    1    2    2    4]
 [  83    0    0   19    0    0    0    0    1    0]
 [ 129    1    0    0    0    0    0    3    0    0]
 [ 362    6    0    0    0   10    4    0    0    0]
 [  97    2    0    1    0    0    9    2    0    0]
 [  77    2    0    0    0    0    2   12    1    1]
 [  26    1    0    0    0    0    0    0    8    0]
 [  27    0    0    0    0    0    0    0    0   10]]


In [67]:
rf_model=RandomForestClassifier(n_estimators=100, class_weight = 'balanced')
rf_model.fit(X_train,y_train)
rf_pred=rf_model.predict(X_test)

In [68]:
acc = accuracy_score(y_test, rf_pred)
cm = confusion_matrix(y_test, rf_pred)
print(acc)
print(cm)

0.5288753799392097
[[ 233  229   83   18   10   46    5    5    3   46]
 [ 111 1225   70    7   10   52    9    6    0   47]
 [ 105  155  174    9    8   26    1   10    2   18]
 [  31   13    6   46    0    1    0    0    2    4]
 [  25   32   18    2   37    5    0    5    1    8]
 [  57  145   18    2    2  141    2    1    0   14]
 [  13   50   16    1    1    5   17    4    0    4]
 [  13   35   18    0    1    1    1   19    0    7]
 [   6   15    2    0    0    1    0    1    8    2]
 [   5   10    5    1    0    1    0    1    0   14]]


Then, I tried oversampling from the minority class to create a new training set. This was used to train random forest and XGBoost models.

In [81]:
from imblearn.over_sampling import SMOTE

In [82]:
smote = SMOTE('minority')
X_s, y_s = smote.fit_sample(X_train, y_train)

In [83]:
X_s.shape

(11962, 2128)

In [245]:
rf_model=RandomForestClassifier(n_estimators=100)
rf_model.fit(X_s,y_s)
rf_pred=rf_model.predict(X_test)

In [246]:
acc = accuracy_score(y_test, rf_pred)
cm = confusion_matrix(y_test, rf_pred)
print(acc)
print(cm)

0.5377176015473888
[[ 241  277   67    8    7   41    4    3   26    4]
 [ 100 1300   61    6    7   45    4    3    9    2]
 [ 101  179  166    7    7   20    2    7   17    2]
 [  21   22    5   44    0    2    0    0    8    1]
 [  29   45   14    0   23    5    0    4   13    0]
 [  63  175   15    1    1  121    2    0    4    0]
 [  13   59   16    0    0    4   14    3    2    0]
 [  15   38   14    0    0    0    3   16    7    2]
 [   7   13    2    0    0    0    0    0   13    0]
 [   8   12    6    1    0    1    0    0    1    8]]


In [247]:
with open('model_rf.pkl.z', 'wb') as file:
    pickle.dump(rf_model, file, protocol=2)

In [208]:
from xgboost import XGBClassifier

model_xgb = XGBClassifier()
model_xgb.fit(X_s, y_s)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints=None,
       learning_rate=0.300000012, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=nan, monotone_constraints=None,
       n_estimators=100, n_jobs=0, num_parallel_tree=1,
       objective='multi:softprob', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=None, subsample=1, tree_method=None,
       validate_parameters=False, verbosity=None)

In [209]:
xgb_pred=model_xgb.predict(X_test)

In [210]:
acc = accuracy_score(y_test, xgb_pred)
cm = confusion_matrix(y_test, xgb_pred)
print(acc)
print(cm)

0.5363360044211108
[[ 332  208   62   10    9   38    3    6    9    1]
 [ 194 1229   43    3    5   45    8    2    7    1]
 [ 153  153  150    5    6   19    0   10   11    1]
 [  40   21    2   32    1    0    0    0    7    0]
 [  46   41   16    0   14    4    0    5    7    0]
 [  95  141   12    1    1  126    4    0    1    1]
 [  27   46    9    1    2    3   18    5    0    0]
 [  25   30   11    0    2    1    1   20    4    1]
 [  11    8    0    0    0    3    0    1   12    0]
 [  14   12    3    0    0    0    0    0    0    8]]


In [227]:
import pickle
import joblib
# with open('model_xgb.pkl', 'wb') as file:
#     pickle.dump(model_xgb, file, protocol=2)
# joblib.dump(model_xgb, 'model.pkl.z', protocol = 2) 

# pickle.dump(model_xgb, open("pima.pickle.dat", "wb"), protocol = 2)

I then tried a combination of oversampling and undersampling. This, however, did not give improved results.

In [90]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

oversample = RandomOverSampler(sampling_strategy='minority')
undersample = RandomUnderSampler(sampling_strategy='majority')

In [92]:
X_o, y_o = oversample.fit_resample(X, y)

In [93]:
X_ou, y_ou = undersample.fit_resample(X_o, y_o)

In [94]:
X_ou.shape

(12098, 2128)

In [95]:
model_xgb.fit(X_ou, y_ou)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints=None,
       learning_rate=0.300000012, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=nan, monotone_constraints=None,
       n_estimators=100, n_jobs=0, num_parallel_tree=1,
       objective='multi:softprob', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=None, subsample=1, tree_method=None,
       validate_parameters=False, verbosity=None)

In [96]:
xgb_pred=model_xgb.predict(X_test)
acc = accuracy_score(y_test, xgb_pred)
cm = confusion_matrix(y_test, xgb_pred)
print(acc)
print(cm)

0.4249792760431058
[[590   0  35   5   3  25   4   3  11   2]
 [874  87 211   6  19 250  44  11  29   6]
 [168   0 314   2   3  13   0   1   7   0]
 [ 29   0   2  70   0   0   0   0   2   0]
 [ 51   0  12   0  58   6   0   2   4   0]
 [122   0   9   0   0 248   2   0   1   0]
 [ 34   0   5   0   0   4  66   2   0   0]
 [ 32   0   6   0   0   1   1  51   2   2]
 [  2   0   0   0   0   0   0   0  33   0]
 [ 13   0   2   0   0   0   0   0   1  21]]


Finally, I used Keras to create a fairly simple DNN for the classification of reddit submissions. This, too, did not give as good of a result as desired.

In [116]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
tokenizer_obj = Tokenizer()

X1 = np.array(df['title'])
y1 = np.array(df['link_flair_text'])
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.30, random_state=42)

In [129]:
total = np.array(list(X1_train) + list(X1_test))
tokenizer_obj.fit_on_texts(total)

#pad the sequences
max_len = max([len(s.split()) for s in total])

vocab_size = len(tokenizer_obj.word_index) + 1

X1_train_tokens = tokenizer_obj.texts_to_sequences(X1_train)
X1_test_tokens = tokenizer_obj.texts_to_sequences(X1_test)

X1_train_pad = pad_sequences(X1_train_tokens, maxlen = max_len, padding = 'post')
X1_test_pad = pad_sequences(X1_test_tokens, maxlen = max_len, padding = 'post')

In [130]:
smote = SMOTE('minority')
X1_s, y1_s = smote.fit_sample(X1_train_pad, y1_train)

In [202]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU
from keras.layers.embeddings import Embedding

EMB_DIM = 100
num_labels = 10

model = Sequential()
model.add(Embedding(vocab_size, EMB_DIM, input_length = max_len))
model.add(GRU(units = 32, dropout = 0.2, recurrent_dropout = 0.2))
model.add(Dense(num_labels, activation = 'softmax'))

model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

print("Model summary:")
print(model.summary())

Model summary:
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_15 (Embedding)     (None, 62, 100)           1763200   
_________________________________________________________________
gru_15 (GRU)                 (None, 32)                12768     
_________________________________________________________________
dense_14 (Dense)             (None, 10)                330       
Total params: 1,776,298
Trainable params: 1,776,298
Non-trainable params: 0
_________________________________________________________________
None


In [203]:
y = np.copy(y1_s)
y_oh = np.zeros((y.shape[0], 10))
for i in range(y.shape[0]):
    y_oh[i][y[i]] = 1

In [204]:
num_epochs = 15
batch_size = 128
model.fit(X1_s, y_oh, batch_size = batch_size, epochs = num_epochs, verbose = 2, validation_split = 0.15)

Train on 10167 samples, validate on 1795 samples
Epoch 1/15
 - 15s - loss: 1.9632 - acc: 0.3328 - val_loss: 1.6807 - val_acc: 0.0000e+00
Epoch 2/15
 - 10s - loss: 1.8420 - acc: 0.3526 - val_loss: 1.7517 - val_acc: 0.0000e+00
Epoch 3/15
 - 11s - loss: 1.8417 - acc: 0.3526 - val_loss: 1.7280 - val_acc: 0.0000e+00
Epoch 4/15
 - 11s - loss: 1.8398 - acc: 0.3526 - val_loss: 1.7578 - val_acc: 0.0000e+00
Epoch 5/15
 - 11s - loss: 1.8397 - acc: 0.3526 - val_loss: 1.7114 - val_acc: 0.0000e+00
Epoch 6/15
 - 11s - loss: 1.8401 - acc: 0.3526 - val_loss: 1.7981 - val_acc: 0.0000e+00
Epoch 7/15
 - 12s - loss: 1.8242 - acc: 0.3526 - val_loss: 1.7708 - val_acc: 0.0000e+00
Epoch 8/15
 - 11s - loss: 1.6935 - acc: 0.4129 - val_loss: 1.6803 - val_acc: 0.4825
Epoch 9/15
 - 11s - loss: 1.5387 - acc: 0.4765 - val_loss: 1.8583 - val_acc: 0.3705
Epoch 10/15
 - 11s - loss: 1.4181 - acc: 0.5204 - val_loss: 2.0282 - val_acc: 0.2953
Epoch 11/15
 - 11s - loss: 1.3011 - acc: 0.5775 - val_loss: 1.9990 - val_acc: 0.36

<keras.callbacks.History at 0x1e4958f28d0>

In [205]:
y = np.copy(y1_test)
y_oh_test = np.zeros((y.shape[0], 10))
for i in range(y.shape[0]):
    y_oh_test[i][y[i]] = 1

In [206]:
score, acc = model.evaluate(X1_test_pad, y_oh_test, batch_size = batch_size, verbose = 2)
acc

0.3702680299330825

XGBoost and Random Forest models with oversampled data had the best results. The Random Forest model was saved to use in the final application.