# Importing necessary libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

# Load data from csv file

In [2]:
df=pd.read_csv(r"C:\Users\Merin\Downloads\COVIDSenti-A.csv")
df.tail()

Unnamed: 0,tweet,label
29995,CDC: Re-test confirms Westerdam cruise ship pa...,neu
29996,Two doctors die of coronavirus within 24 hours...,neu
29997,BEIJING - The lockdown of Guo Jing's neighbour...,neu
29998,#CoronavirusOutbreak in #Balochistan !!\n#CPEC...,neu
29999,The Australian dollar has hit a fresh decade l...,neu


In [3]:
df.shape

(30000, 2)

In [4]:
df.label.value_counts()

neu    22949
neg     5083
pos     1968
Name: label, dtype: int64

In [5]:
df.isnull().sum()

tweet    0
label    0
dtype: int64

In [6]:
df['label'].unique()

array(['neu', 'neg', 'pos'], dtype=object)

# Cleaning the text

In [7]:
import re
doc_clean = []
for i in df['tweet']:
    doc = re.sub(r'[^\x00-\x7F]+', ' ', i)
    doc = re.sub(r'@\w+', '', doc)
    doc = doc.lower()
    doc = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', doc)
    doc = re.sub(r'[0-9]', '', doc)
    doc = re.sub(r'\s{2,}', ' ', doc)
    doc_clean.append(doc)

# Adding  clean_tweet column to the dataframe df

In [8]:
df['clean_tweet'] = doc_clean
df.head()

Unnamed: 0,tweet,label,clean_tweet
0,Coronavirus | Human Coronavirus Types | CDC ht...,neu,coronavirus | human coronavirus types | cdc
1,"@shehryar_taseer That‚Äôs üíØ true , \nCorona...",neu,"that s true , corona virus swine flue bird fl..."
2,"TLDR: Not SARS, possibly new coronavirus. Diff...",neg,"tldr: not sars, possibly new coronavirus. diff..."
3,Disease outbreak news from the WHO: Middle Eas...,neu,disease outbreak news from the who: middle eas...
4,China - Media: WSJ says sources tell them myst...,neu,china - media: wsj says sources tell them myst...


In [9]:
x=df['clean_tweet']
y=df['label']

In [10]:
print(x.head())

0         coronavirus | human coronavirus types | cdc 
1     that s true , corona virus swine flue bird fl...
2    tldr: not sars, possibly new coronavirus. diff...
3    disease outbreak news from the who: middle eas...
4    china - media: wsj says sources tell them myst...
Name: clean_tweet, dtype: object


# Creating feature vectors of x 

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
x_vectors = vectorizer.fit_transform(x)

# Splitting into train and test

In [13]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_vectors, y, test_size=0.2, random_state=42)

# Model Building

In [14]:
from sklearn import svm
from sklearn.metrics import classification_report

In [15]:
model = svm.SVC(kernel='linear')
model.fit(x_train, y_train)

SVC(kernel='linear')

In [16]:
y_pred = model.predict(x_test)

In [17]:
report = classification_report(y_test, y_pred, output_dict=True)
print('positive: ', report['pos'])
print('negative: ', report['neg'])

positive:  {'precision': 0.8517241379310345, 'recall': 0.5853080568720379, 'f1-score': 0.6938202247191012, 'support': 422}
negative:  {'precision': 0.8812989921612542, 'recall': 0.7846460618145563, 'f1-score': 0.8301687763713079, 'support': 1003}


In [21]:
tweet = """Many people took the vaccine"""
vec = vectorizer.transform([tweet]) # vectorizing
print(model.predict(vec))

['pos']


In [22]:
tweet = """It was a very difficult situation"""
vec = vectorizer.transform([tweet]) # vectorizing
print(model.predict(vec))

['neg']


In [23]:
tweet = """The life was too boring during covid"""
vec = vectorizer.transform([tweet]) # vectorizing
print(model.predict(vec))

['neu']
