# **IMPORT LIBRARIES**

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import spacy

# **READ THE DATASET**

In [2]:
data = pd.read_csv("/content/Twitter Sentiments.csv")

# **EXPLORE THE DATASET**

**READ FIRST 5 DATA**

In [3]:
data.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


**REMOVE UNNECESSARY COLUMNS**

In [4]:
data.drop(columns = ['id'] , axis = 1 , inplace = True)

**COLUMN NAME**

In [5]:
data.columns

Index(['label', 'tweet'], dtype='object')

**CHECKING NULL VALUES**

In [6]:
data.isnull().sum()

label    0
tweet    0
dtype: int64

**ROWS & COLUMNS**

In [7]:
data.shape

(21602, 2)

**COLUMN - DATATYPE**

In [8]:
data.dtypes

label     int64
tweet    object
dtype: object

# **LOWER TEXT**

In [9]:
data['clean_tweet'] = data['tweet'].str.lower()

# **REGULAR EXPRESSION**

In [12]:
def rem(text , pat):
  w = re.findall(pat , text)

  for words in w:
    text = re.sub(words , "" , text)
  return text

In [13]:
data['clean_tweet'] = data['clean_tweet'].apply(lambda x: rem(x ,'@user' ))
data['clean_tweet'] = data['clean_tweet'].apply(lambda x: rem(x ,'#' ))
data['clean_tweet'] = data['clean_tweet'].apply(lambda x: rem(x ,'!' ))
data['clean_tweet'] = data['clean_tweet'].apply(lambda x: rem(x ,'$' ))
data['clean_tweet'] = data['clean_tweet'].apply(lambda x: rem(x ,'%' ))
data['clean_tweet'] = data['clean_tweet'].apply(lambda x: rem(x ,'/' ))
data['clean_tweet'] = data['clean_tweet'].apply(lambda x: rem(x ,',' ))
data['clean_tweet'] = data['clean_tweet'].apply(lambda x: rem(x ,':' ))
data['clean_tweet'] = data['clean_tweet'].apply(lambda x: rem(x ,';' ))
data['clean_tweet'] = data['clean_tweet'].apply(lambda x: rem(x ,'<' ))
data['clean_tweet'] = data['clean_tweet'].apply(lambda x: rem(x ,'=' ))
data['clean_tweet'] = data['clean_tweet'].apply(lambda x: rem(x ,'>' ))
data['clean_tweet'] = data['clean_tweet'].apply(lambda x: rem(x ,'^' ))
data['clean_tweet'] = data['clean_tweet'].apply(lambda x: rem(x ,'_' ))
data['clean_tweet'] = data['clean_tweet'].apply(lambda x: rem(x ,'`' ))

In [14]:
data.head()

Unnamed: 0,label,tweet,clean_tweet
0,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so sel...
1,0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit i can't use cause the...
2,0,bihday your majesty,bihday your majesty
3,0,#model i love u take with u all the time in ...,model i love u take with u all the time in u...
4,0,factsguide: society now #motivation,factsguide society now motivation


# **STEMMING & STOP WORDS**

In [15]:
nlp = spacy.load("en_core_web_sm")

In [18]:
def preprocess(text):
  doc = nlp(text)
  filtered_text = []

  for token in doc:
    if token.is_stop or token.is_punct:
      continue
    filtered_text.append(token.lemma_)

  return " ".join(filtered_text)

In [19]:
data['clean_tweet'] = data['clean_tweet'].apply(preprocess)

In [20]:
data.head()

Unnamed: 0,label,tweet,clean_tweet
0,0,@user when a father is dysfunctional and is s...,father dysfunctional selfish drag kid dysfu...
1,0,@user @user thanks for #lyft credit i can't us...,thank lyft credit use cause offer wheelchai...
2,0,bihday your majesty,bihday majesty
3,0,#model i love u take with u all the time in ...,model love u u time urð± ðððð ...
4,0,factsguide: society now #motivation,factsguide society motivation


# **MODEL BUILDING**

In [21]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

In [22]:
x_train , x_test , y_train , y_test = train_test_split(
    data['clean_tweet'],
    data['label'],
    test_size = 0.2
)

**KNN**

In [23]:
from sklearn.neighbors import KNeighborsClassifier

In [24]:
clf = Pipeline( [
    ('vectorizer_tfidf',TfidfVectorizer()),
    ('KNN',KNeighborsClassifier())
])

In [25]:
clf.fit(x_train , y_train)

In [26]:
y_pred = clf.predict(x_test)

In [27]:
y_test[:10]

2030     0
1533     0
9971     0
15301    0
13380    0
6573     0
19251    0
1171     0
16874    0
8025     0
Name: label, dtype: int64

In [28]:
y_pred[:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [29]:
clf.score(x_test , y_test)

0.9405230270770655

**DECISION TREE CLASSIFIER**

In [30]:
from sklearn.tree import DecisionTreeClassifier

In [31]:
dtc = Pipeline( [
    ('vectorizer_tfidf',TfidfVectorizer()),
    ('DTC',DecisionTreeClassifier())
])

In [32]:
dtc.fit(x_train , y_train)

In [33]:
dtc.score(x_test , y_test)

0.9449201573709789

In [34]:
y_pred_dtc = dtc.predict(x_test)

In [36]:
y_test[50:60]

19746    0
10286    0
4972     0
8896     1
8166     0
3499     0
5567     1
7686     0
12318    0
9371     0
Name: label, dtype: int64

In [37]:
y_pred_dtc[50:60]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

**SUPPORT VECTOR CLASSIFIER**

In [38]:
from sklearn.svm import SVC

In [39]:
svc = Pipeline( [
    ('vectorizer_tfidf',TfidfVectorizer()),
    ('SVC',SVC())
])

In [40]:
svc.fit(x_train , y_train)

In [41]:
svc.score(x_test , y_test)

0.9562601249710715

In [42]:
y_pred_svc = svc.predict(x_test)

In [43]:
y_test[50:60]

19746    0
10286    0
4972     0
8896     1
8166     0
3499     0
5567     1
7686     0
12318    0
9371     0
Name: label, dtype: int64

In [44]:
y_pred_svc[50:60]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

**RANDOM FOREST CLASSIFIER**

In [45]:
from sklearn.ensemble import RandomForestClassifier

In [46]:
rfc = Pipeline( [
    ('vectorizer_tfidf',TfidfVectorizer()),
    ('RFC',RandomForestClassifier())
])

In [47]:
rfc.fit(x_train , y_train)

In [48]:
rfc.score(x_test , y_test)

0.9625086785466327

In [49]:
y_pred_rfc = rfc.predict(x_test)

In [50]:
y_test[11:20]

7481     0
5974     0
17025    0
1365     0
13033    0
7589     0
20167    0
11253    1
14565    1
Name: label, dtype: int64

In [51]:
y_pred_rfc[11:20]

array([0, 0, 0, 0, 0, 0, 0, 1, 1])