<div style="text-align:center">
    <h1>SVM Classification - SVC</h1>
    <h3>Project: Coronavirus tweets classification</h3>
    <h4><a href="https://amzenterprise.ir/">Ali Momenzadeh</a></h4>
</div>

Dataset on Kaggle: https://www.kaggle.com/datasets/datatattle/covid-19-nlp-text-classification

#### Import libraries

In [None]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import time
import re

import warnings as wrn
wrn.filterwarnings('ignore')

#### Load and prepare data

In [None]:
# Read the train and test CSVs with the same encoding, then preview the
# first rows of the training frame.
train_set, test_set = (
    pd.read_csv(csv_path, encoding="latin1")
    for csv_path in ('Corona_NLP_train.csv', 'Corona_NLP_test.csv')
)

train_set.head()

##### Drop irrelevant features

In [None]:
# Columns that are not used by the rest of the notebook.
unrelevant_features = ["UserName","ScreenName","Location","TweetAt"]

# Rebind instead of dropping in place, and tolerate already-missing columns,
# so that re-running this cell does not raise KeyError on the second run.
train_set = train_set.drop(columns=unrelevant_features, errors="ignore")
test_set = test_set.drop(columns=unrelevant_features, errors="ignore")
train_set.head()

##### Frequency of each Sentiment

In [None]:
# Number of training rows per raw `Sentiment` label.
train_set["Sentiment"].value_counts()

##### Encoding categorical variables

In [None]:
# Raw labels that are merged into the single "positive" class.
positive_labels = ["Positive", "Extremely Positive"]

# .copy() makes these frames independent of train_set/test_set. Their
# "Sentiment" column is overwritten in a later cell, and assigning into a
# filtered slice without a copy is chained assignment (SettingWithCopyWarning).
positives = train_set[train_set["Sentiment"].isin(positive_labels)].copy()
positives_test = test_set[test_set["Sentiment"].isin(positive_labels)].copy()
print(positives["Sentiment"].value_counts())
positives.head()

In [None]:
# Raw labels that are merged into the single "negative" class.
negative_labels = ["Negative", "Extremely Negative"]

# .copy() makes these frames independent of train_set/test_set. Their
# "Sentiment" column is overwritten in a later cell, and assigning into a
# filtered slice without a copy is chained assignment (SettingWithCopyWarning).
negatives = train_set[train_set["Sentiment"].isin(negative_labels)].copy()
negatives_test = test_set[test_set["Sentiment"].isin(negative_labels)].copy()
print(negatives["Sentiment"].value_counts())
negatives.head()

In [None]:
# .copy() makes these frames independent of train_set/test_set. Their
# "Sentiment" column is overwritten in a later cell, and assigning into a
# filtered slice without a copy is chained assignment (SettingWithCopyWarning).
neutrals = train_set[train_set["Sentiment"] == "Neutral"].copy()
neutrals_test = test_set[test_set["Sentiment"] == "Neutral"].copy()
print(neutrals["Sentiment"].value_counts())
neutrals.head()

In [None]:
# Replace the textual sentiment with an integer code:
# 0 = negative, 1 = neutral, 2 = positive.
frames_with_codes = (
    (negatives, 0),
    (negatives_test, 0),
    (positives, 2),
    (positives_test, 2),
    (neutrals, 1),
    (neutrals_test, 1),
)
for frame, sentiment_code in frames_with_codes:
    frame["Sentiment"] = sentiment_code

negatives.head()

In [None]:
# Stack the train and test subsets of every class into one frame.
subsets = [
    positives,
    positives_test,
    neutrals,
    neutrals_test,
    negatives,
    negatives_test,
]

# reset_index() without drop=True keeps the previous row labels as an
# "index" column and gives the combined frame a fresh 0..n-1 index.
data = pd.concat(subsets, axis=0).reset_index()

In [None]:
# Print the column summary of the combined frame.
data.info()

In [None]:
# Preview the first rows of the combined frame.
data.head()

##### Randomly select data points for examination

In [None]:
import random

# Print a handful of randomly chosen tweets together with their encoded label.
N_EXAMPLES = 9  # same number of examples as the original range(1, 10)

for _ in range(N_EXAMPLES):
    # randrange excludes the upper bound. random.randint(0, len(data)) could
    # return len(data), which is one past the last row and raises KeyError.
    random_ind = random.randrange(len(data))
    print(str(data["OriginalTweet"].iloc[random_ind]),end="\nLabel: ")
    print(str(data["Sentiment"].iloc[random_ind]),end="\n\n")

##### Frequency distributions

In [None]:
def _word_frequencies(sentiment_code):
    """Count lower-cased, whitespace-split tokens over all tweets with the given label."""
    tweets = data[data["Sentiment"] == sentiment_code]["OriginalTweet"]
    return nltk.FreqDist(
        token
        for tweet in tweets
        for token in tweet.lower().split()
    )


positiveFD = _word_frequencies(2)
negativeFD = _word_frequencies(0)
# Name kept as `neutralDF` (not `neutralFD`) because the plotting cell uses it.
neutralDF = _word_frequencies(1)

##### Most used words

In [None]:
# Plot the 50 most frequent tokens of the positive tweets on a fresh figure.
fig, ax = plt.subplots(figsize=(8, 6))
ax.set_title("Most Used Words in Positive Tweets")
positiveFD.plot(50)
plt.show()

In [None]:
# Plot the 50 most frequent tokens of the negative tweets on a fresh figure.
fig, ax = plt.subplots(figsize=(8, 6))
ax.set_title("Most Used Words in Negative Tweets")
negativeFD.plot(50)
plt.show()

In [None]:
# Plot the 50 most frequent tokens of the neutral tweets on a fresh figure.
fig, ax = plt.subplots(figsize=(8, 6))
ax.set_title("Most Used Words in Neutral Tweets")
neutralDF.plot(50)
plt.show()

#### Data preprocessing

Lemmatization is the process of grouping together the different inflected forms of a word so they can be analyzed as a single item. Lemmatization is similar to stemming but it brings context to the words. So it links words with similar meanings to one word.

In [None]:
lemma = WordNetLemmatizer()
# English stop words such as "the", "is" and "and".
swords = stopwords.words("english")
# Set copy for constant-time membership tests; `swords` itself stays a list.
stopword_set = set(swords)


def clean_tweet(text):
    """Normalise one raw tweet into a space-joined string of lemmatized tokens.

    Steps: strip links, replace every non-alphanumeric character with a
    space, lower-case and tokenize, drop stopwords, lemmatize, and drop
    stopwords once more.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces (may be empty).
    """
    # Cleaning links
    text = re.sub(r'http\S+', '', text)

    # Cleaning everything except alphabetical and numerical characters
    text = re.sub("[^a-zA-Z0-9]", " ", text)

    tokens = nltk.word_tokenize(text.lower())

    # Drop stopwords BEFORE lemmatizing: the default (noun) lemmatizer mangles
    # some of them (e.g. "was" -> "wa", "has" -> "ha"), and the mangled forms
    # would no longer match the stopword list.
    tokens = [token for token in tokens if token not in stopword_set]
    tokens = [lemma.lemmatize(token) for token in tokens]

    # Second pass keeps the original behaviour of removing any token whose
    # lemma is itself a stopword.
    tokens = [token for token in tokens if token not in stopword_set]

    return " ".join(tokens)


cleanedData = [clean_tweet(text) for text in data["OriginalTweet"]]

In [None]:
# Show the first five cleaned tweets, separated by a blank line.
for position in range(5):
    cleaned_tweet = cleanedData[position]
    print(cleaned_tweet, end="\n\n")

<img src="https://miro.medium.com/max/714/1*UOjWvDziH86T2MmiDpp98Q.png">

<img src="https://raw.githubusercontent.com/cassieview/intro-nlp-wine-reviews/master/imgs/vectorchart.PNG">

In [None]:
# Bag of words: token-count matrix over the cleaned tweets, with the
# vocabulary capped at 10000 features.
vectorizer = CountVectorizer(max_features=10000)
vectorizer.fit(cleanedData)
BOW = vectorizer.transform(cleanedData)

In [None]:
# Display the repr of the bag-of-words matrix as the cell output.
BOW

#### Training and Test 

In [None]:
from sklearn.model_selection import train_test_split

labels = np.asarray(data["Sentiment"])

# A fixed random_state makes the split, and every metric below, reproducible
# across kernel restarts. stratify keeps the class proportions equal in both
# partitions.
# NOTE(review): the vectorizer was fitted on all tweets before this split, so
# the vocabulary has seen the test tweets. Consider fitting it on the
# training part only.
X_train, X_test, y_train, y_test = train_test_split(
    BOW,
    labels,
    random_state=42,
    stratify=labels,
)

for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

In [None]:
from sklearn.svm import SVC

# Fit an SVC with default settings and report the wall-clock time of the fit.
start_time = time.time()
model = SVC().fit(X_train, y_train)
end_time = time.time()

process_time = round(end_time - start_time, 2)
print("Fitting SVC took {} seconds".format(process_time))

In [None]:
# Predict a label for every row of the held-out split with the fitted model.
predictions = model.predict(X_test)

#### Evaluation

In [None]:
from sklearn.metrics import accuracy_score

# Share of test rows whose predicted label equals the true label, as a percentage.
accuracy_percent = accuracy_score(y_test, predictions) * 100
print("Accuracy of the model is {}%".format(accuracy_percent))

<img src="http://rasbt.github.io/mlxtend/user_guide/evaluate/confusion_matrix_files/confusion_matrix_1.png">

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true labels against predicted labels on the test split.
test_confusion = confusion_matrix(y_test, predictions)
print(test_confusion)

In [None]:
from sklearn.metrics import classification_report

# Text report of the per-class metrics on the test split.
test_report = classification_report(y_test, predictions)
print(test_report)