# Task-3 - Building a Flair Detector

### Importing Required Modules

In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

flairs = [  "Coronavirus", "Politics", "Policy/Economy", "AskIndia", "Science/Technology",
            "Non-Political" , "Business/Finance"]

def string_form(value):
    return str(value)

def clean_text(text):
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text

data_tab = pd.read_csv('Reddit_Data.csv')

In [3]:
#Text Pre-Processing
data_tab['title'] = data_tab['title'].apply(string_form)

data_tab['title'] = data_tab['title'].apply(clean_text)

### Flair Detection using Title as feature

In [4]:
y = data_tab.flair
X = data_tab.title

#Setting training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.1,random_state = 42)

In [5]:
#Naive Byes Classifier
from sklearn.naive_bayes import MultinomialNB

nb = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])
    
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

print("Results of Naive Bayes Classifier")
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred, target_names=flairs))

Results of Naive Bayes Classifier
accuracy 0.7315436241610739
                    precision    recall  f1-score   support

       Coronavirus       1.00      0.55      0.71        20
          Politics       0.00      0.00      0.00         1
    Policy/Economy       0.63      1.00      0.77        67
          AskIndia       1.00      0.76      0.87        38
Science/Technology       0.00      0.00      0.00         3
     Non-Political       1.00      0.11      0.19        19
  Business/Finance       0.00      0.00      0.00         1

          accuracy                           0.73       149
         macro avg       0.52      0.35      0.36       149
      weighted avg       0.80      0.73      0.69       149



  _warn_prf(average, modifier, msg_start, len(result))


In [6]:
#Linear SVM
from sklearn.linear_model import SGDClassifier

sgd = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None))])
    
sgd.fit(X_train, y_train)
y_pred = sgd.predict(X_test)
    
print("Results of Linear Support Vector Machine")
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=flairs))

Results of Linear Support Vector Machine
accuracy 0.8187919463087249
                    precision    recall  f1-score   support

       Coronavirus       0.78      0.70      0.74        20
          Politics       0.00      0.00      0.00         1
    Policy/Economy       0.78      0.96      0.86        67
          AskIndia       0.86      0.84      0.85        38
Science/Technology       0.00      0.00      0.00         3
     Non-Political       1.00      0.63      0.77        19
  Business/Finance       0.00      0.00      0.00         1

          accuracy                           0.82       149
         macro avg       0.49      0.45      0.46       149
      weighted avg       0.80      0.82      0.80       149



  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression

logreg = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), 
                   ('clf', LogisticRegression(n_jobs=1, C=1e5))])
    
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

print("Results of Logistic Regression")
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=flairs))

Results of Logistic Regression
accuracy 0.8993288590604027
                    precision    recall  f1-score   support

       Coronavirus       0.79      0.75      0.77        20
          Politics       0.00      0.00      0.00         1
    Policy/Economy       0.89      0.94      0.91        67
          AskIndia       0.92      0.95      0.94        38
Science/Technology       1.00      0.33      0.50         3
     Non-Political       1.00      1.00      1.00        19
  Business/Finance       0.00      0.00      0.00         1

          accuracy                           0.90       149
         macro avg       0.66      0.57      0.59       149
      weighted avg       0.89      0.90      0.89       149



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
#Random Forest
from sklearn.ensemble import RandomForestClassifier
  
ranfor = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), 
                   ('clf', RandomForestClassifier(n_estimators = 1000, random_state = 42))])

ranfor.fit(X_train, y_train)
y_pred = ranfor.predict(X_test)

print("Results of Random Forest")
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=flairs))

Results of Random Forest
accuracy 0.9261744966442953
                    precision    recall  f1-score   support

       Coronavirus       1.00      0.75      0.86        20
          Politics       0.00      0.00      0.00         1
    Policy/Economy       0.89      1.00      0.94        67
          AskIndia       0.92      0.95      0.94        38
Science/Technology       1.00      0.33      0.50         3
     Non-Political       1.00      1.00      1.00        19
  Business/Finance       0.00      0.00      0.00         1

          accuracy                           0.93       149
         macro avg       0.69      0.58      0.61       149
      weighted avg       0.92      0.93      0.92       149



  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
#MLP Classifier
from sklearn.neural_network import MLPClassifier
  
mlp = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), 
                ('clf', MLPClassifier(hidden_layer_sizes=(30,30,30)))])

mlp.fit(X_train, y_train)
y_pred = mlp.predict(X_test)

print("Results of MLP Classifier")
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=flairs))

Results of MLP Classifier
accuracy 0.87248322147651
                    precision    recall  f1-score   support

       Coronavirus       0.71      0.75      0.73        20
          Politics       0.00      0.00      0.00         1
    Policy/Economy       0.88      0.88      0.88        67
          AskIndia       0.92      0.95      0.94        38
Science/Technology       1.00      0.33      0.50         3
     Non-Political       0.90      1.00      0.95        19
  Business/Finance       0.00      0.00      0.00         1

          accuracy                           0.87       149
         macro avg       0.63      0.56      0.57       149
      weighted avg       0.86      0.87      0.86       149



  _warn_prf(average, modifier, msg_start, len(result))


### Storing Linear SVM Model because of it's less size (>500 MB models can't be deployed on Heroku)

In [10]:
import joblib
filename = 'finalized_model.sav'
joblib.dump(sgd,filename)

['finalized_model.sav']