<a href="https://colab.research.google.com/github/MiHarsh/CodaLab-SharedTask/blob/main/Conventional_Approaches_Kmeans_KNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import required libraries

In [1]:
!nvidia-smi -L

GPU 0: Tesla P100-PCIE-16GB (UUID: GPU-dda12bca-4dcd-2307-4676-9002282b46f3)


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score,f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report


import sys
import re
!pip install emoji --quiet
import emoji
!pip install contractions --quiet
import contractions
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import unicodedata

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Dataset

In [2]:
train_datapath="https://raw.githubusercontent.com/MiHarsh/MiHarsh/master/Constraint_English_Train%20-%20Sheet1.csv"
val_datapath  ="https://raw.githubusercontent.com/MiHarsh/MiHarsh/master/Constraint_English_Val%20-%20Sheet1.csv"
train         = pd.read_csv(train_datapath)
valid         = pd.read_csv(val_datapath)
total         = pd.concat([train,valid],ignore_index=True)
mix           = total.iloc[:,1:]
mix

Unnamed: 0,tweet,label
0,The CDC currently reports 99031 deaths. In gen...,real
1,States reported 1121 deaths a small rise from ...,real
2,Politically Correct Woman (Almost) Uses Pandem...,fake
3,#IndiaFightsCorona: We have 1524 #COVID testin...,real
4,Populous states can generate large case counts...,real
...,...,...
8555,Donald Trump wrongly claimed that New Zealand ...,fake
8556,Current understanding is #COVID19 spreads most...,real
8557,Nothing screams “I am sat around doing fuck al...,fake
8558,Birx says COVID-19 outbreak not under control ...,fake


## Data Cleaning

In [3]:
def cleaning(text):
  text= text.lower()
  text= emoji.demojize(text)
  text=contractions.fix(text)
  text=text.strip()
  text=text.replace('[^\w\s]','')
  text=re.sub(r'http\S+', '', text)
  REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
  BAD_SYMBOLS_RE = re.compile('[^0-9a-z +]')
  text = REPLACE_BY_SPACE_RE.sub(' ' , text)
  text = BAD_SYMBOLS_RE.sub(' ',text)
  
  return text

clean=mix['tweet'].apply(cleaning)
STOPWORDS = set(stopwords.words('english'))

ff=[]
for i in clean:
  text=unicodedata.normalize('NFKD', i).encode('ascii', 'ignore').decode('utf-8', 'ignore')
  ff.append(text)
dd=pd.DataFrame(ff)
dataset = pd.concat([dd,mix['label']],axis=1)
dataset.label = dataset.label.map({'real':0,'fake':1})
dataset

Unnamed: 0,0,label
0,the cdc currently reports 99031 deaths in gen...,0
1,states reported 1121 deaths a small rise from ...,0
2,politically correct woman almost uses pandem...,1
3,indiafightscorona we have 1524 covid testin...,0
4,populous states can generate large case counts...,0
...,...,...
8555,donald trump wrongly claimed that new zealand ...,1
8556,current understanding is covid19 spreads most...,0
8557,nothing screams i am sat around doing fuck al...,1
8558,birx says covid 19 outbreak not under control ...,1


## Models for Conventional Approaches

In [4]:
Km   = KMeans(n_clusters=2, random_state=0)
KNN  = KNeighborsClassifier(n_neighbors=3)

## Count Vector

In [5]:
cv= CountVectorizer()
transform = cv.fit_transform(dataset[0].values.astype('U'))
x_train= transform[:len(train)]                                           #trainset 
x_valid= transform[len(train):]                                           #validset
print(x_train.shape)
print(x_valid.shape)

(6420, 16488)
(2140, 16488)


### Training the models

In [6]:
Km.fit(x_train,dataset['label'][:len(train)])
KNN.fit(x_train,dataset['label'][:len(train)])

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

### Prediction of labels for Valid dataset using Count Vector

In [7]:
y_pred1= Km.predict(x_valid)
y_pred2= KNN.predict(x_valid)


In [8]:
for i in range(2):
  ac=accuracy_score(dataset['label'][len(train):], globals()['y_pred'+str(int(i+1))])
  print(ac)
  print(classification_report(dataset['label'][len(train):], globals()['y_pred'+str(int(i+1))]))

0.4766355140186916
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1120
           1       0.48      1.00      0.65      1020

    accuracy                           0.48      2140
   macro avg       0.24      0.50      0.32      2140
weighted avg       0.23      0.48      0.31      2140

0.75
              precision    recall  f1-score   support

           0       0.92      0.57      0.70      1120
           1       0.67      0.95      0.78      1020

    accuracy                           0.75      2140
   macro avg       0.80      0.76      0.74      2140
weighted avg       0.80      0.75      0.74      2140



  _warn_prf(average, modifier, msg_start, len(result))


## Tf-IDf

In [9]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(dataset[0])
X_train= X[:len(train)]
X_valid= X[len(train):]

In [10]:
Km   = KMeans(n_clusters=2, random_state=0)
KNN  = KNeighborsClassifier(n_neighbors=3)

In [11]:
Km.fit(X_train,dataset['label'][:len(train)])
KNN.fit(X_train,dataset['label'][:len(train)])

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [12]:
Y_pred1= Km.predict(X_valid)
Y_pred2= KNN.predict(X_valid)

In [13]:
for i in range(2):
  ac=accuracy_score(dataset['label'][len(train):], globals()['Y_pred'+str(int(i+1))])
  print(ac)
  print(classification_report(dataset['label'][len(train):], globals()['Y_pred'+str(int(i+1))]))

0.3574766355140187
              precision    recall  f1-score   support

           0       0.43      0.67      0.52      1120
           1       0.04      0.02      0.02      1020

    accuracy                           0.36      2140
   macro avg       0.24      0.34      0.27      2140
weighted avg       0.24      0.36      0.28      2140

0.9107476635514019
              precision    recall  f1-score   support

           0       0.91      0.93      0.92      1120
           1       0.92      0.90      0.91      1020

    accuracy                           0.91      2140
   macro avg       0.91      0.91      0.91      2140
weighted avg       0.91      0.91      0.91      2140

