## Import Relevant Libraries

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
import re
from pathlib import Path
from tqdm import tqdm

In [3]:
import xgboost as xgb
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline,linear_model
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix,accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [42]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import nltk
nltk.download('stopwords')
nltk.download('punkt')

stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /Users/apple/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/apple/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Read Data from csv
The dataset consists of details of 200 posts for each flair in r/india. The data was collected on 15 April,2020. 

Each posts has 10 comments and the flairs used are 'AskIndia', 'Business/Finance', 'Food', 'Non-Political', 'Photography', 'Policy/Economy', 'Politics', 'Science/Technology','Sports'

In [5]:
PATH=Path("../")

In [77]:
df=pd.read_csv(PATH/'data/data.csv')
df.head(3)

Unnamed: 0,title,score,id,body,author,flair,url,comms_num,created,comment,authors,combined_features
0,"Lost my Job, Sick Mother and Paralysed Dad, In...",1042,g014wc,Hi....It's really tough time for everyone. I r...,sanand_satwik,AskIndia,https://www.reddit.com/r/india/comments/g014wc...,132,1586742000.0,I'm a freelancer. Don't listen to the idiots ...,hashedram diabapp xataari Aashayrao sarcrasti...,"Lost my Job, Sick Mother and Paralysed Dad, In..."
1,Why does the government come with a begging bo...,647,fxofyu,"We have floods, terrorist attacks, famines due...",TWO-WHEELER-MAFIA,AskIndia,https://www.reddit.com/r/india/comments/fxofyu...,205,1586448000.0,I don't understand why they don't use money f...,Kinky-Monk ak32009 fools_eye None DwncstSheep...,Why does the government come with a begging bo...
2,Mother's condition is going worse due to hepat...,756,g0zlly,"Hi folks, I really appreciate the warm respons...",sanand_satwik,AskIndia,https://www.reddit.com/r/india/comments/g0zlly...,92,1586871000.0,If anyone knows who is influential on Twitter...,AlternativeDrop6 TheRobotsHaveCome lanky32 pl...,Mother's condition is going worse due to hepat...


### Using combination of title, body text and the comments as features

In [79]:
#using only relevant columns
df=df[['flair','combined_features']]
# df['title'] = df['title'].astype(str)+df['body'].astype(str)

df["flair"].apply(lambda x:str(x))
df["combined_features"].apply(lambda x:str(x))

In [80]:
#get list of flairs in the dataset and convert into integers labels
all_flairs=list(np.unique(df['flair']))
df['flair']=df['flair'].apply(lambda x :all_flairs.index(x))
all_flairs=list(np.unique(df['flair']))

#### Data cleaning
Two cleaning methods are used:
1. Stemming
2. Without stemming

In [81]:
# Preprocessing the data using two methods.
def preprocess_stem(x):
    stemmer = PorterStemmer()
    x = x.lower()
    x = re.sub("[^a-z0-9'@+-/]",' ',x)
    tokens = x.split()
    new_tok = [i for i in tokens if i not in Stopwords]
    return ' '.join([stemmer.stem(i) for i in new_tok])

def preprocess(x):
    x = x.lower()
    x = re.sub("[^a-z0-9'@+-/]",' ',x)
    tokens = x.split()
    new_tok = [i for i in tokens if i not in Stopwords]
    return ' '.join(new_tok)

Data preprocessing can be skipped as the results were nearly similar after and before cleaning

In [56]:
Stopwords = list(stopwords.words('english'))
df["title"].apply(lambda x:preprocess_stem(x))
pass

In [83]:
## Divide into train and test set
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(df['title'],df['flair'],random_state=2020)

## Convert text data into vectors

4 different methods are used to vectorize the words:
1. Count based Vectorization
2. Word level Term-frequency inverse document frequency (tf-idf)
3. n-gram level tfidf
4. character level tf-idf

In [84]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)

## Classification Models
Various classification models are used for classification: Linear SVM, Random Forest, Naive Bayes, Logistic Regression, SVM.

The scores are reported on validation set for all 4 vectorization methods and the best one is selected to report the accuracy.

In [71]:
print('===========================   Linear SVM =====================')


vectorizers=[(count_vect,'count_vectorizer'),(tfidf_vect,'tfidf_vectorizer_word'),(tfidf_vect_ngram,'tfidf_vectorizer_word_ngram'),(tfidf_vect_ngram_chars,'tfidf_vectorizer_ngram_chars')]

for vectorizer in vectorizers:
    print(vectorizer[1])
    clf = Pipeline([
    ('vect',vectorizer[0]),
        ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=4, tol=None))
    ])

    clf = clf.fit(X_train, y_train)
    test_preds=clf.predict(X_test)

    auc = classification_report(y_test, test_preds)
    print (auc)


count_vectorizer
                    precision    recall  f1-score   support

          AskIndia       0.37      0.57      0.45        44
  Business/Finance       0.26      0.59      0.36        44
              Food       0.67      0.31      0.43        51
     Non-Political       0.36      0.23      0.28        52
       Photography       0.61      0.84      0.70        51
    Policy/Economy       0.39      0.37      0.38        49
          Politics       0.58      0.40      0.47        55
Science/Technology       0.35      0.24      0.28        51
            Sports       0.86      0.57      0.68        53

          accuracy                           0.45       450
         macro avg       0.49      0.46      0.45       450
      weighted avg       0.50      0.45      0.45       450

tfidf_vectorizer_word
                    precision    recall  f1-score   support

          AskIndia       0.52      0.59      0.55        44
  Business/Finance       0.47      0.43      0.45        

In [23]:
print('===========================   Random Forest =====================')


vectorizers=[(count_vect,'count_vectorizer'),(tfidf_vect,'tfidf_vectorizer_word'),(tfidf_vect_ngram,'tfidf_vectorizer_word_ngram'),(tfidf_vect_ngram_chars,'tfidf_vectorizer_ngram_chars')]

for vectorizer in vectorizers:
    print(vectorizer[1])
    clf = Pipeline([
    ('vect',vectorizer[0]),
    ('clf', RandomForestClassifier(n_estimators = 80, random_state = 42)),
    ])
    
    clf = clf.fit(X_train, y_train)
    predictions=clf.predict_proba(X_test)
    test_preds=np.argmax(predictions,axis=1)

    auc = classification_report(y_test, test_preds)
    print (auc)


count_vectorizer
              precision    recall  f1-score   support

           0       0.36      0.82      0.50        44
           1       0.45      0.39      0.41        44
           2       0.33      0.20      0.25        51
           3       0.20      0.25      0.22        52
           4       0.71      0.69      0.70        51
           5       0.44      0.37      0.40        49
           6       0.46      0.47      0.47        55
           7       0.25      0.12      0.16        51
           8       0.70      0.62      0.66        53

    accuracy                           0.43       450
   macro avg       0.43      0.44      0.42       450
weighted avg       0.44      0.43      0.42       450

tfidf_vectorizer_word
              precision    recall  f1-score   support

           0       0.40      0.84      0.54        44
           1       0.48      0.36      0.42        44
           2       0.39      0.27      0.32        51
           3       0.15      0.15      

In [93]:
print('===========================   Naive Bayes =====================')


vectorizers=[(count_vect,'count_vectorizer'),(tfidf_vect,'tfidf_vectorizer_word'),(tfidf_vect_ngram,'tfidf_vectorizer_word_ngram'),(tfidf_vect_ngram_chars,'tfidf_vectorizer_ngram_chars')]

#Naive Bayes for all features:
for vectorizer in vectorizers:
    print(vectorizer[1])
    clf = Pipeline([
    ('vect',vectorizer[0]),
    ('clf', MultinomialNB()),
    ])
    clf = clf.fit(X_train, y_train)
    predictions=clf.predict_proba(X_test)
    test_preds=np.argmax(predictions,axis=1)

    auc = classification_report(y_test, test_preds)
    print (auc)


count_vectorizer
              precision    recall  f1-score   support

           0       0.24      0.89      0.38        44
           1       0.62      0.23      0.33        44
           2       0.92      0.47      0.62        51
           3       0.17      0.15      0.16        52
           4       0.93      0.25      0.40        51
           5       0.38      0.53      0.44        49
           6       0.48      0.76      0.59        55
           7       0.56      0.10      0.17        51
           8       1.00      0.38      0.55        53

    accuracy                           0.42       450
   macro avg       0.59      0.42      0.41       450
weighted avg       0.60      0.42      0.41       450

tfidf_vectorizer_word
              precision    recall  f1-score   support

           0       0.20      0.89      0.33        44
           1       0.61      0.32      0.42        44
           2       0.91      0.41      0.57        51
           3       0.43      0.17      

In [86]:
print('===========================   Logistic Regression =====================')

vectorizers=[(count_vect,'count_vectorizer'),(tfidf_vect,'tfidf_vectorizer_word'),(tfidf_vect_ngram,'tfidf_vectorizer_word_ngram'),(tfidf_vect_ngram_chars,'tfidf_vectorizer_ngram_chars')]

for vectorizer in vectorizers:
    print(vectorizer[1])
    clf = Pipeline([
    ('vect',vectorizer[0]),
    ('clf', linear_model.LogisticRegression(multi_class='auto',solver='lbfgs')),
    ])
    clf = clf.fit(X_train, y_train)
    predictions=clf.predict_proba(X_test)
    test_preds=np.argmax(predictions,axis=1)

    auc = classification_report(y_test, test_preds)
    print (auc)

count_vectorizer


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


              precision    recall  f1-score   support

           0       0.62      0.45      0.53        44
           1       0.45      0.43      0.44        44
           2       0.74      0.57      0.64        51
           3       0.41      0.42      0.42        52
           4       0.76      0.80      0.78        51
           5       0.44      0.45      0.44        49
           6       0.63      0.62      0.62        55
           7       0.49      0.53      0.51        51
           8       0.66      0.87      0.75        53

    accuracy                           0.58       450
   macro avg       0.58      0.57      0.57       450
weighted avg       0.58      0.58      0.57       450

tfidf_vectorizer_word
              precision    recall  f1-score   support

           0       0.55      0.59      0.57        44
           1       0.55      0.50      0.52        44
           2       0.79      0.67      0.72        51
           3       0.50      0.44      0.47        52
  

In [87]:
print('===========================   SVM =====================')


vectorizers=[(count_vect,'count_vectorizer'),(tfidf_vect,'tfidf_vectorizer_word'),(tfidf_vect_ngram,'tfidf_vectorizer_word_ngram'),(tfidf_vect_ngram_chars,'tfidf_vectorizer_ngram_chars')]

# SVM on count vectors: SVM Classifier Pipeline on word count vector
for vectorizer in vectorizers:
    print(vectorizer[1])
    clf = Pipeline([
    ('vect',vectorizer[0]),
    ('clf', SVC(gamma='scale',probability=True)),
    ])
    clf = clf.fit(X_train, y_train)
    predictions=clf.predict_proba(X_test)
    test_preds=np.argmax(predictions,axis=1)
    auc = classification_report(y_test, test_preds)
    print (auc)

count_vectorizer
              precision    recall  f1-score   support

           0       0.39      0.68      0.50        44
           1       0.25      0.20      0.23        44
           2       0.33      0.24      0.28        51
           3       0.32      0.17      0.23        52
           4       0.56      0.59      0.57        51
           5       0.32      0.20      0.25        49
           6       0.46      0.65      0.54        55
           7       0.27      0.12      0.16        51
           8       0.41      0.68      0.51        53

    accuracy                           0.40       450
   macro avg       0.37      0.39      0.36       450
weighted avg       0.37      0.40      0.36       450

tfidf_vectorizer_word
              precision    recall  f1-score   support

           0       0.52      0.55      0.53        44
           1       0.50      0.43      0.46        44
           2       0.83      0.59      0.69        51
           3       0.48      0.56      

#### The logistic regression model performs the best among all models. This model is saved using pickle module and used for  final prediction.

In [None]:
import pickle
clf = Pipeline([
    ('vect',tfidf_vect),
    ('clf', linear_model.LogisticRegression(multi_class='auto',solver='lbfgs')),
  ])
clf = clf.fit(X_train, y_train)

filename = 'logreg_tfidf.sav'
pickle.dump(clf, open(filename, 'wb'))
