### Load the data

In [1]:
import pandas as pd
# Read the three dataset splits. The paths are relative, so the CSV files
# must sit in the notebook's working directory.
train = pd.read_csv(filepath_or_buffer='Constraint_Train.csv')
val = pd.read_csv(filepath_or_buffer='Constraint_Val.csv')
test_with_labels = pd.read_csv(filepath_or_buffer='english_test_with_labels.csv')

# Preview the first rows of the training split.
train.head(n=5)

Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In gen...,real
1,2,States reported 1121 deaths a small rise from ...,real
2,3,Politically Correct Woman (Almost) Uses Pandem...,fake
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,real
4,5,Populous states can generate large case counts...,real


### merge train and validation

In [17]:
# Fold the validation split into the training frame. No `on=` is given, so
# pandas joins on every column the two frames have in common; with
# how='outer' rows that appear in only one frame are kept as well.
# Note that `train` is rebound here: from this cell on it means train + val.
train = train.merge(val, how='outer')

# Preview the combined frame.
train.head(n=5)

Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In gen...,real
1,2,States reported 1121 deaths a small rise from ...,real
2,3,Politically Correct Woman (Almost) Uses Pandem...,fake
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,real
4,5,Populous states can generate large case counts...,real


### function for cleaning text

In [18]:
import nltk
import re
from nltk.corpus import stopwords
# English stop-word vocabulary from NLTK, stored as a set for fast lookups.
stop_words = set(stopwords.words("english"))


def clean_text(string):
    """Normalise a raw tweet into a space-separated string of kept tokens.

    The steps run in this order:
      1. lowercase the text;
      2. replace every ``http...`` and ``www...`` run of non-space characters
         with a single space;
      3. replace ``&`` with `` and ``;
      4. replace every run of characters other than ASCII letters/digits with
         a single space;
      5. split on whitespace and drop tokens found in ``stop_words``.

    Parameters
    ----------
    string : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    # (pattern, replacement) pairs; the order matters because the URL patterns
    # must run before punctuation is stripped.
    substitutions = (
        (r"http(\S)+", ' '),
        (r"www(\S)+", ' '),
        (r"&", ' and '),
        (r"[^0-9a-zA-Z]+", ' '),
    )
    text = string.lower()
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    kept_tokens = [token for token in text.split() if token not in stop_words]
    return " ".join(kept_tokens)

### An example of the clean_text function

In [19]:
# Sample tweet containing capitals, punctuation, stop words and a URL.
string = (
    'States reported 1121 deaths a small rise from last Tuesday. '
    'Southern states reported 640 of those deaths. '
    'https://t.co/YASGRTT4ux'
)
clean_text(string)


'states reported 1121 deaths small rise last tuesday southern states reported 640 deaths'

### Cleaning the text

In [20]:
# Overwrite the raw tweet text with its cleaned form in both frames.
# clean_text is passed directly; Series.map calls it once per element.
train["tweet"] = train["tweet"].map(clean_text)
test_with_labels["tweet"] = test_with_labels["tweet"].map(clean_text)

### First 5 rows of train after cleaning

In [21]:
# Show the first five rows to confirm the "tweet" column now holds cleaned text.
train.head(n=5)

Unnamed: 0,id,tweet,label
0,1,cdc currently reports 99031 deaths general dis...,real
1,2,states reported 1121 deaths small rise last tu...,real
2,3,politically correct woman almost uses pandemic...,fake
3,4,indiafightscorona 1524 covid testing laborator...,real
4,5,populous states generate large case counts loo...,real


### extract features from text

In [22]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import make_pipeline
# Token counts followed by TF-IDF re-weighting, both with default settings.
pipeline = make_pipeline(
    CountVectorizer(),
    TfidfTransformer(),
)

# Fit on the training text only; the test text is transformed with the
# vocabulary and IDF weights learned from the training data.
pipeline.fit(train["tweet"])
x_train = pipeline.transform(train["tweet"])
x_test = pipeline.transform(test_with_labels["tweet"])

# Display the training matrix summary (shape / sparsity).
x_train

<8560x16282 sparse matrix of type '<class 'numpy.float64'>'
	with 135248 stored elements in Compressed Sparse Row format>

### Function for printing metrics

In [23]:
from sklearn.metrics import confusion_matrix,precision_score, recall_score,f1_score,accuracy_score
def metrics(truelabel,predlabel):
    """Print the confusion matrix, accuracy, precision, recall and F1.

    Precision, recall and F1 are computed with ``average='weighted'``.

    Parameters
    ----------
    truelabel : array-like
        Ground-truth labels.
    predlabel : array-like
        Labels predicted by a model, aligned with ``truelabel``.

    Returns
    -------
    None
        The scores are only printed.
    """
    print("Confusion_matrix : ", confusion_matrix(truelabel, predlabel))
    print("Accuracy : ", accuracy_score(truelabel, predlabel))
    # The three weighted scores share one call shape, so drive them from a table.
    # NOTE: the "Precison" label spelling is kept as-is so that printed output
    # stays identical to what the notebook already records.
    weighted_scorers = (
        ("Precison : ", precision_score),
        ("Recall : ", recall_score),
        ("F1 : ", f1_score),
    )
    for label, scorer in weighted_scorers:
        print(label, scorer(truelabel, predlabel, average='weighted'))

In [24]:
import warnings
warnings.filterwarnings('ignore')

## SVM

### evaluate SVM model on test data before hyperparameter tuning

In [11]:
from sklearn.svm import SVC
# Baseline: SVC with its default hyperparameters.
# fit() returns the fitted estimator, so construction and fitting are chained.
svmmodel = SVC().fit(x_train, train["label"])

# Score the baseline on the held-out test split.
pred = svmmodel.predict(x_test)
metrics(truelabel=test_with_labels["label"], predlabel=pred)

Confusion_matrix :  [[ 975   45]
 [  84 1036]]
Accuracy :  0.9397196261682244
Precison :  0.9404065362883206
Recall :  0.9397196261682244
F1 :  0.9397509659412703


### tuning hyper parameters in SVM

In [104]:
from sklearn.model_selection import GridSearchCV

# Search space: 6 values of C x 4 kernels = 24 candidates.
param = {
    'C': [1000, 100, 10, 1.0, 0.1, 0.001],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
}

# 5-fold cross-validated search on accuracy, using all available cores.
grid = GridSearchCV(
    estimator=SVC(),
    param_grid=param,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# GridSearchCV.fit returns the search object itself, so `best_svmmodel`
# refers to the same fitted search as `grid`.
best_svmmodel = grid.fit(x_train, train["label"])
print("best hyper parameters are: ", best_svmmodel.best_params_)
print("best score is: ", best_svmmodel.best_score_)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
best hyper parameters are:  {'C': 1.0, 'kernel': 'linear'}
best score is:  0.9404205607476636


### evaluate SVM model on test data after hyperparameter tuning

In [105]:
# Predict on the test split with the tuned SVM search object and report the scores.
best_pred = best_svmmodel.predict(x_test)
metrics(truelabel=test_with_labels["label"], predlabel=best_pred)

Confusion_matrix :  [[ 962   58]
 [  74 1046]]
Accuracy :  0.9383177570093458
Precison :  0.9384590081461273
Recall :  0.9383177570093458
F1 :  0.938335887165027


## Logistic Regression

### evaluate Logistic Regression model on test data before hyperparameter tuning

In [87]:
from sklearn.linear_model import LogisticRegression
# Baseline: logistic regression with its default hyperparameters.
# fit() returns the fitted estimator, so construction and fitting are chained.
lrmodel = LogisticRegression().fit(x_train, train["label"])

# Score the baseline on the held-out test split.
pred = lrmodel.predict(x_test)
metrics(truelabel=test_with_labels["label"], predlabel=pred)

Confusion_matrix :  [[ 956   64]
 [ 100 1020]]
Accuracy :  0.9233644859813084
Precison :  0.9239643129226535
Recall :  0.9233644859813084
F1 :  0.9234030758774813


### tuning hyper parameters in Logistic Regression

In [91]:
# Search space: 5 solvers x 4 penalties x 5 values of C = 100 candidates.
# The first solver used to be spelled 'newton-c', which is not a solver name
# scikit-learn recognises, so every candidate using it failed to fit and the
# newton-cg solver was never actually evaluated.
# Not every solver supports every penalty; unsupported pairs fail at fit time
# and are scored with GridSearchCV's `error_score` rather than aborting the search.
# NOTE(review): the string 'none' for `penalty` is only accepted by some
# scikit-learn releases (newer ones expect None) - confirm against the
# installed version before re-running.
param = {
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'penalty': ['none', 'l1', 'l2', 'elasticnet'],
    'C': [100, 10, 1.0, 0.1, 0.01],
}

# 5-fold cross-validated search on accuracy, using all available cores.
grid = GridSearchCV(
    LogisticRegression(),
    param_grid=param,
    cv=5,
    n_jobs=-1,
    verbose=1,
    scoring="accuracy",
)

# GridSearchCV.fit returns the search object itself.
best_lrmodel = grid.fit(x_train, train["label"])
print("best hyper parameters are: ", best_lrmodel.best_params_)
print("best score is: ", best_lrmodel.best_score_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
best hyper parameters are:  {'C': 1.0, 'penalty': 'none', 'solver': 'saga'}
best score is:  0.938785046728972


### evaluate Logistic Regression model on test data after hyperparameter tuning

In [93]:
# Predict on the test split with the tuned logistic-regression search object
# and report the scores.
best_pred = best_lrmodel.predict(x_test)
metrics(truelabel=test_with_labels["label"], predlabel=best_pred)

Confusion_matrix :  [[ 946   74]
 [  69 1051]]
Accuracy :  0.9331775700934579
Precison :  0.9331724240238581
Recall :  0.9331775700934579
F1 :  0.9331698911616902


## Decision Tree

### evaluate Decision Tree model on test data before hyperparameter tuning

In [100]:
from sklearn import tree
# Baseline: decision tree with default hyperparameters.
# Tree construction is randomised (the feature order is permuted at each
# split), so the seed is fixed to make the reported metrics repeatable
# between runs.
dtmodel = tree.DecisionTreeClassifier(random_state=42)
dtmodel = dtmodel.fit(x_train, train["label"])

# Score the baseline on the held-out test split.
pred = dtmodel.predict(x_test)
metrics(test_with_labels["label"], pred)

Confusion_matrix :  [[879 141]
 [123 997]]
Accuracy :  0.8766355140186916
Precison :  0.8766451624451769
Recall :  0.8766355140186916
F1 :  0.8765781235756938


### tuning hyper parameters in Decision tree

In [102]:
# Search space: 29 depths x 2 criteria x 9 leaf sizes = 522 candidates.
param = {
    'max_depth': range(1, 30),
    'criterion': ["gini", "entropy"],
    'min_samples_leaf': range(1, 10),
}

# The estimator is seeded so that every candidate is compared under the same
# randomisation and the selected parameters are repeatable between runs.
grid = GridSearchCV(
    tree.DecisionTreeClassifier(random_state=42),
    param_grid=param,
    cv=5,
    n_jobs=-1,
    verbose=1,
    scoring="accuracy",
)

# GridSearchCV.fit returns the search object itself.
best_dtmodel = grid.fit(x_train, train["label"])
print("best hyper parameters are: ", best_dtmodel.best_params_)
print("best score is: ", best_dtmodel.best_score_)

Fitting 5 folds for each of 522 candidates, totalling 2610 fits
best hyper parameters are:  {'criterion': 'gini', 'max_depth': 28, 'min_samples_leaf': 1}
best score is:  0.8711448598130842


### evaluate Decision Tree model on test data after hyperparameter tuning

In [103]:
# Predict on the test split with the tuned decision-tree search object and
# report the scores.
best_pred = best_dtmodel.predict(x_test)
metrics(truelabel=test_with_labels["label"], predlabel=best_pred)

Confusion_matrix :  [[900 120]
 [154 966]]
Accuracy :  0.8719626168224299
Precison :  0.8725284279710915
Recall :  0.8719626168224299
F1 :  0.8720254147601684


## GradientBoosting

### evaluate GradientBoosting model on test data before hyperparameter tuning

In [19]:
from sklearn.ensemble import GradientBoostingClassifier

# Baseline: gradient boosting with default hyperparameters.
# The seed is fixed because the underlying trees are built with randomised
# feature ordering; without it the reported metrics can change between runs.
grBoostingClf = GradientBoostingClassifier(random_state=42)
grBoostingClf.fit(x_train, train["label"])

# Score the baseline on the held-out test split.
pred = grBoostingClf.predict(x_test)
metrics(test_with_labels["label"], pred)

Confusion_matrix :  [[924  96]
 [180 940]]
Accuracy :  0.8710280373831776
Precison :  0.8737905927056678
Recall :  0.8710280373831776
F1 :  0.8710658895990753


### tuning hyper parameters in GradientBoosting

In [18]:
import numpy as np
from sklearn.model_selection import GridSearchCV

# Search space: 3 learning rates x 4 ensemble sizes (100, 200, 300, 400).
params = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': np.arange(100, 500, 100),
}

# Seeded so that the candidates are compared under the same randomisation and
# the selected parameters are repeatable between runs.
gradientBoosting = GradientBoostingClassifier(random_state=42)

# `scoring="accuracy"` spells out the metric a classifier's default scorer
# already uses, matching the other grid searches in this notebook, and
# `n_jobs=-1` runs the fits in parallel without affecting the result.
# The 4-fold split is kept as in the original search.
# NOTE: the variable name below is misspelled ("gradientoosting"); it is kept
# unchanged in case other cells refer to it.
gradientoosting_gridSearch = GridSearchCV(
    gradientBoosting,
    params,
    cv=4,
    scoring="accuracy",
    n_jobs=-1,
)

# GridSearchCV.fit returns the search object itself.
best_gbmodel = gradientoosting_gridSearch.fit(x_train, train["label"])

print("best hyper parameters are: ", best_gbmodel.best_params_)
print("best score is: ", best_gbmodel.best_score_)

best hyper parameters are:  {'learning_rate': 0.1, 'n_estimators': 400}
best score is:  0.9033878504672898


### evaluate GradientBoosting model on test data after hyperparameter tuning

In [20]:
# Predict on the test split with the tuned gradient-boosting search object
# and report the scores.
best_pred = best_gbmodel.predict(x_test)
metrics(truelabel=test_with_labels["label"], predlabel=best_pred)

Confusion_matrix :  [[937  83]
 [125 995]]
Accuracy :  0.902803738317757
Precison :  0.9036026806500433
Recall :  0.902803738317757
F1 :  0.9028554773656248


## Multinomial Naive Bayes

### evaluate Multinomial Naive Bayes model on test data before hyperparameter tuning

In [25]:
from sklearn.naive_bayes import MultinomialNB
# Baseline: multinomial naive Bayes with its default smoothing.
# fit() returns the fitted estimator, so construction and fitting are chained.
mnb = MultinomialNB().fit(x_train, train["label"])

# Score the baseline on the held-out test split.
pred = mnb.predict(x_test)
metrics(truelabel=test_with_labels["label"], predlabel=pred)

Confusion_matrix :  [[ 892  128]
 [  54 1066]]
Accuracy :  0.9149532710280374
Precison :  0.9166864003948798
Recall :  0.9149532710280374
F1 :  0.9147125623684118


### tuning hyper parameters in Multinomial Naive Bayes

In [30]:
from sklearn.model_selection import GridSearchCV
# Search space: six values of the smoothing parameter alpha.
param = {
    'alpha': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001],
}

# 5-fold cross-validated search on accuracy, using all available cores.
grid = GridSearchCV(
    estimator=MultinomialNB(),
    param_grid=param,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# GridSearchCV.fit returns the search object itself.
best_mnbmodel = grid.fit(x_train, train["label"])
print("best hyper parameters are: ", best_mnbmodel.best_params_)
print("best score is: ", best_mnbmodel.best_score_)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
best hyper parameters are:  {'alpha': 0.1}
best score is:  0.9232476635514019


### evaluate Multinomial Naive Bayes model on test data after hyperparameter tuning

In [31]:
# Predict on the test split with the tuned naive-Bayes search object and
# report the scores.
best_pred = best_mnbmodel.predict(x_test)
metrics(truelabel=test_with_labels["label"], predlabel=best_pred)

Confusion_matrix :  [[ 927   93]
 [  66 1054]]
Accuracy :  0.9257009345794392
Precison :  0.9258853398390909
Recall :  0.9257009345794392
F1 :  0.9256451061332132
