In [None]:
#imports
import numpy
import nltk
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import string
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
%matplotlib inline


# The NLTK stop-word corpus must be downloaded before stopwords.words() can be used.
nltk.download('stopwords')


# Load the training and test splits. Both are tab-separated files read with the
# same explicit list of column names, so the list is written once and shared.
COLUMN_NAMES = [
    'ID',
    'label',
    'statement',
    'subject',
    'speaker',
    "speaker's job title",
    'state',
    'party',
    'barely true counts',
    'false counts',
    'half true counts',
    'mostly true counts',
    'pants on fire counts',
    'context',
    'justification',
]
messages = pd.read_csv('train2.tsv', sep='\t', names=COLUMN_NAMES)
test_messages = pd.read_csv('test2.tsv', sep='\t', names=COLUMN_NAMES)


# Handle missing data.
# Three metadata columns get the placeholder string 'NA' where a value is missing;
# after that, any row that still has a missing value in any column is dropped.
#
# The fill is done with DataFrame.fillna({column: value}, inplace=True) on the frame
# itself. Calling fillna(inplace=True) on a column selection (df[col].fillna(...))
# operates on an intermediate object: recent pandas versions warn about it, and with
# Copy-on-Write enabled the parent frame is left unchanged.
na_placeholders = {
    "speaker's job title": 'NA',
    'state': 'NA',
    'context': 'NA',
}

messages.fillna(na_placeholders, inplace=True)
messages.dropna(inplace=True)

test_messages.fillna(na_placeholders, inplace=True)
test_messages.dropna(inplace=True)


def makeBinary(label):
    """Collapse a fine-grained truthfulness label into the string 'True' or 'False'.

    'false', 'barely-true' and 'pants-fire' map to 'False'; every other value
    (including anything unrecognised) maps to 'True'. The result is a string,
    not a bool.
    """
    return 'False' if label in ('false', 'barely-true', 'pants-fire') else 'True'
def text_process(msg):
    """Clean one text message and split it into tokens.

    Every character listed in ``string.punctuation`` is removed, the remainder
    is split on whitespace, and words whose lower-cased form is an English
    stop word are dropped. Surviving tokens keep their original casing.

    Parameters
    ----------
    msg : str
        The raw text to clean.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    # Build the stop-word lookup once per call. Previously
    # stopwords.words('english') was evaluated again, and scanned as a list,
    # for every single token.
    stop_words = set(stopwords.words('english'))
    # One pass that deletes all punctuation characters.
    no_punct = msg.translate(str.maketrans('', '', string.punctuation))
    return [word for word in no_punct.split() if word.lower() not in stop_words]

# Derive the two-class target from the six-way label with makeBinary.
# NOTE: the new column is named 'binary label' (with a space) in the training frame
# but 'binary_label' (with an underscore) in the test frame.
messages['binary label'] = messages['label'].map(makeBinary)
test_messages['binary_label'] = test_messages['label'].map(makeBinary)


################################################## RANDOM FOREST MODEL ########################################################
# Pipeline: statement text -> token counts (tokenised by text_process) -> TF-IDF -> random forest.
# random_state is pinned so that repeated runs build the same forest and print the same
# reports; an unseeded forest gives different numbers on every run.
pipeline = Pipeline([('bow', CountVectorizer(analyzer=text_process)),
                     ('tfidf', TfidfTransformer()),
                     ('classifier', RandomForestClassifier(random_state=0))
                    ])

# Six-way task: train on the original 'label' column and score on the test split.
pipeline.fit(messages['statement'], messages['label'])
pred = pipeline.predict(test_messages['statement'])
print('\n\n#### RANDOM FOREST MODEL FOR SIX-WAY FEATURES ####\n')
print(confusion_matrix(test_messages['label'], pred))
print('\n')
print(classification_report(test_messages['label'], pred))

## SIX-WAY ACCURACY (RANDOM FOREST CLASSIFIER) = 0.21 (noted from an earlier, unseeded run) ##

# Binary task: the same pipeline is re-fitted on the 'True'/'False' labels.
pipeline.fit(messages['statement'], messages['binary label'])
pred = pipeline.predict(test_messages['statement'])
print('\n\n#### RANDOM FOREST MODEL FOR BINARY FEATURES ####\n')
print(confusion_matrix(test_messages['binary_label'], pred))
print('\n')
print(classification_report(test_messages['binary_label'], pred))

## BINARY ACCURACY (RANDOM FOREST CLASSIFIER) = 0.58 (noted from an earlier, unseeded run) ##


################################################## MULTINOMIALNB MODEL ########################################################
# Pipeline: statement text -> token counts (tokenised by text_process) -> TF-IDF -> multinomial naive Bayes.
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB()),
])

# The same pipeline is fitted and scored twice: first on the six-way labels, then on
# the binary labels. Each entry is (report heading, training label column, test label column).
for heading, train_col, test_col in (
    ('\n\n#### MULTINOMIALNB MODEL FOR SIX-WAY FEATURES ####\n', 'label', 'label'),
    ('\n\n#### MULTINOMIALNB MODEL FOR BINARY FEATURES ####\n', 'binary label', 'binary_label'),
):
    pipeline.fit(messages['statement'], messages[train_col])
    pred = pipeline.predict(test_messages['statement'])
    print(heading)
    print(confusion_matrix(test_messages[test_col], pred))
    print('\n')
    print(classification_report(test_messages[test_col], pred))

## SIX-WAY ACCURACY (MULTINOMIALNB CLASSIFIER) = 0.24 ##
## BINARY ACCURACY (MULTINOMIALNB CLASSIFIER) = 0.60 ##


################################################## LOGISTIC REGRESSION MODEL ########################################################
# Pipeline: statement text -> token counts (tokenised by text_process) -> TF-IDF -> logistic regression.
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),
    ('tfidf', TfidfTransformer()),
    ('classifier', LogisticRegression()),
])

# Run both classification tasks through the pipeline, six-way first and binary second.
# Each entry is (report heading, training label column, test label column).
lr_tasks = [
    ('\n\n#### LOGISTIC REGRESSION MODEL FOR SIX-WAY FEATURES ####\n', 'label', 'label'),
    ('\n\n#### LOGISTIC REGRESSION MODEL FOR BINARY FEATURES ####\n', 'binary label', 'binary_label'),
]
for heading, train_col, test_col in lr_tasks:
    pipeline.fit(messages['statement'], messages[train_col])
    pred = pipeline.predict(test_messages['statement'])
    print(heading)
    print(confusion_matrix(test_messages[test_col], pred))
    print('\n')
    print(classification_report(test_messages[test_col], pred))

## SIX-WAY ACCURACY (LOGISTIC REGRESSION) = 0.25 ##
## BINARY ACCURACY (LOGISTIC REGRESSION) = 0.61 ##


############################################################## SVM MODEL ########################################################
# Support-vector classifier whose C and gamma are chosen by GridSearchCV.
# The grid below has 5 values of C and 5 of gamma, i.e. 25 combinations to evaluate.
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
}

# The grid search is the final pipeline step, so it receives the TF-IDF features
# produced by the two steps before it.
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),
    ('tfidf', TfidfTransformer()),
    ('classifier', GridSearchCV(SVC(), param_grid, verbose=3)),
])

# Fit and score on the six-way labels, then on the binary labels. Each fit repeats the
# whole grid search, so this section takes some time to run depending on computer speed.
# Each entry is (report heading, training label column, test label column).
for heading, train_col, test_col in (
    ('\n\n#### SVM MODEL FOR SIX-WAY FEATURES ####\n', 'label', 'label'),
    ('\n\n#### SVM MODEL FOR BINARY FEATURES ####\n', 'binary label', 'binary_label'),
):
    pipeline.fit(messages['statement'], messages[train_col])
    pred = pipeline.predict(test_messages['statement'])
    print(heading)
    print(confusion_matrix(test_messages[test_col], pred))
    print('\n')
    print(classification_report(test_messages[test_col], pred))

## SIX-WAY ACCURACY (SVM) = 0.25 ##
## BINARY ACCURACY (SVM) = 0.62 ##

#--------------------------------------------------------------------------------------------------------------------------------