# Start

In [None]:
import pandas as pd
import re

In [None]:
# Location of the tweets CSV. The path looks like a Colab Google Drive mount;
# change this single constant to run the notebook against another copy.
TWEETS_CSV = '/content/drive/MyDrive/Code Practice/Datasets/Tweets.csv'
data = pd.read_csv(TWEETS_CSV)

In [None]:
data.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


# Text Cleaning

## To Lowercase

In [None]:
# Lower-case every tweet. Missing values are replaced with an empty string
# first: converting them with str(x) would produce the literal token "nan",
# which passes through every later cleaning step unchanged.
data["cleaned_text"] = data["text"].fillna("").astype(str).str.lower()

In [None]:
# Strip every run of digits from the lower-cased text.
# The pattern is a raw string: "\d" inside a normal string literal is an
# invalid escape sequence and raises a warning on recent Python versions.
temp = [re.sub(r"\d+", "", txt) for txt in data["cleaned_text"]]

In [None]:
# Write the processed strings from `temp` back into the working column.
data["cleaned_text"] = temp

## Remove Punctuations and Special Chars

### Special Chars and Punctuations

In [None]:
# Keep only ASCII letters, spaces and newlines; every other character
# (punctuation, symbols, any remaining digits) is deleted.
temp = [re.sub("[^a-zA-Z \n]", r"", tweet) for tweet in data["cleaned_text"]]

In [None]:
# Write the processed strings from `temp` back into the working column.
data["cleaned_text"] = temp

## Remove Stopwords

In [None]:
import nltk
# Download the NLTK resources this notebook relies on:
#   stopwords - word list read via stopwords.words('english')
#   punkt     - tokenizer data used by word_tokenize
#   wordnet   - lexical database used by WordNetLemmatizer
# NOTE(review): newer NLTK releases may additionally need "punkt_tab" for
# word_tokenize - confirm against the installed version.
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# stop_words = stopwords.words('english')
# temp_list = []
# temp_2 = ""
# for txt in data["text"]:
#   words = word_tokenize(str(txt))
#   temp = [word for word in words if not word in stop_words]
#   for t in temp:
#     temp_2 = temp_2+" "+t
#   temp_list.append(temp_2)

In [None]:
# Use a set for constant-time membership tests in remove_stopwords; the list
# returned by stopwords.words() would be scanned linearly for every token of
# every tweet.
# NOTE(review): punctuation has already been stripped from cleaned_text at
# this point, so contractions in the list (e.g. "don't") cannot match tokens
# such as "dont" - confirm whether stop-word removal should run earlier.
stop_words = set(stopwords.words('english'))

In [None]:
def remove_stopwords(tweets):
  """Return `tweets` with every token that appears in `stop_words` dropped.

  The text is split with `word_tokenize`; the surviving tokens are joined
  back together with single spaces.
  """
  kept = []
  for token in word_tokenize(tweets):
    if token not in stop_words:
      kept.append(token)
  return " ".join(kept)

temp_list = data["cleaned_text"].apply(remove_stopwords)

In [None]:
# Write the processed Series in `temp_list` back into the working column.
data["cleaned_text"] = temp_list

# Stemming

In [None]:
from nltk.stem import PorterStemmer
pd = PorterStemmer()

In [None]:
def stem_words(tweets):
  words = word_tokenize(tweets)
  temp = [pd.stem(word) for word in words]
  return " ".join(temp)

temp_list = data["cleaned_text"].apply(stem_words)

In [None]:
# Write the processed Series in `temp_list` back into the working column.
data["cleaned_text"] = temp_list

# Lemmatizing

In [None]:
from nltk import WordNetLemmatizer
# Single shared lemmatizer instance, used by lemma() for every token.
lm = WordNetLemmatizer()

In [None]:
def lemma(tweets):
  """Return `tweets` with every token passed through `lm.lemmatize`.

  Tokens come from `word_tokenize` and are joined back with single spaces.
  """
  words = word_tokenize(tweets)
  return " ".join(lm.lemmatize(word) for word in words)

temp_list = data["cleaned_text"].apply(lemma)
# Store the lemmatized text, as every earlier cleaning step does. Without this
# assignment the result stayed only in `temp_list` and the vectorizers below
# were fitted on the stemmed-only column.
data["cleaned_text"] = temp_list

In [None]:
temp_list

0                                            id respond go
1                                  sooo sad miss san diego
2                                                bos bulli
3                                      interview leav alon
4                    son couldnt put releas alreadi bought
                               ...                        
27476    wish could come see u denver husband lost job ...
27477    ive wonder rake client made clear net dont for...
27478    yay good enjoy break probabl need hectic weeke...
27479                                                worth
27480                           flirt go atg smile yay hug
Name: cleaned_text, Length: 27481, dtype: object

In [None]:
data.head()

Unnamed: 0,textID,text,selected_text,sentiment,cleaned_text
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,id respond go
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,sooo sad miss san diego
2,088c60f138,my boss is bullying me...,bullying me,negative,boss bulli
3,9642c003ef,what interview! leave me alone,leave me alone,negative,interview leav alon
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,son couldnt put releas alreadi bought


# Model Building

## CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer configuration:
#   max_df=0.90       - drop terms that occur in more than 90% of the tweets
#   min_df=2          - drop terms that occur in fewer than 2 tweets
#   max_features=1000 - keep only the 1000 most frequent remaining terms
#   stop_words        - additionally filter scikit-learn's built-in English list
bow_vector = CountVectorizer(max_df=0.90,
                            min_df=2,
                            max_features = 1000,
                            stop_words = 'english')

In [None]:
# Learn the vocabulary from the cleaned tweets and build the document-term
# count matrix in one step.
bow = bow_vector.fit_transform(data["cleaned_text"])

In [None]:
# Vocabulary terms in column order; used below as DataFrame column labels.
bow_vector_names = bow_vector.get_feature_names_out()

In [None]:
# Re-importing guarantees that `pd` refers to pandas here, regardless of any
# earlier rebinding of that name in the kernel session.
import pandas as pd
# Dense feature table: one row per tweet, one column per vocabulary term.
model_dataframe = pd.DataFrame(bow.toarray(),columns=bow_vector_names)

In [None]:
model_dataframe

Unnamed: 0,abl,absolut,access,account,ach,act,actual,ad,add,addict,...,yesterday,yo,youll,young,youtub,youv,yr,yum,yummi,yup
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27476,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27477,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27478,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27479,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Train Test Split

In [None]:
data.head()

Unnamed: 0,textID,text,selected_text,sentiment,cleaned_text
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,id respond go
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,sooo sad miss san diego
2,088c60f138,my boss is bullying me...,bullying me,negative,boss bulli
3,9642c003ef,what interview! leave me alone,leave me alone,negative,interview leav alon
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,son couldnt put releas alreadi bought


In [None]:
# Target labels: the `sentiment` column of the loaded dataset.
y = data["sentiment"]

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
x_train_bow, x_test_bow, y_train, y_test = train_test_split(
    model_dataframe,
    y,
    test_size=0.3,
    random_state=42,
)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Raise the iteration cap above scikit-learn's default of 100: with the default
# the solver stops early on this data with "TOTAL NO. of ITERATIONS REACHED
# LIMIT", leaving the model not fully converged.
lgr = LogisticRegression(max_iter=1000)

In [None]:
# Fit the classifier on the training split only.
lgr.fit(x_train_bow,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
lgr.score(x_train_bow,y_train)

0.7335204824287793

In [None]:
# Predicted sentiment labels for the held-out test rows.
predictions = lgr.predict(x_test_bow)

In [None]:
from sklearn.metrics import confusion_matrix,classification_report

In [None]:
confusion_matrix(y_test,predictions)

array([[1405,  772,  161],
       [ 442, 2516,  413],
       [  95,  627, 1814]])

In [None]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

    negative       0.72      0.60      0.66      2338
     neutral       0.64      0.75      0.69      3371
    positive       0.76      0.72      0.74      2536

    accuracy                           0.70      8245
   macro avg       0.71      0.69      0.69      8245
weighted avg       0.70      0.70      0.70      8245



# Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes treats each feature as a continuous, normally
# distributed variable.
# NOTE(review): the features here are sparse term counts; MultinomialNB is the
# variant scikit-learn documents for word-count data - consider comparing.
gbn = GaussianNB()

In [None]:
# Fit the Naive Bayes classifier on the training split only.
gbn.fit(x_train_bow,y_train)

In [None]:
gbn.score(x_train_bow,y_train)

0.5375337908089

In [None]:
# Predicted sentiment labels for the held-out test rows.
predictgbn = gbn.predict(x_test_bow)

In [None]:
print(classification_report(y_test,predictgbn))

              precision    recall  f1-score   support

    negative       0.53      0.63      0.58      2338
     neutral       0.52      0.18      0.27      3371
    positive       0.46      0.77      0.58      2536

    accuracy                           0.49      8245
   macro avg       0.50      0.53      0.48      8245
weighted avg       0.50      0.49      0.45      8245



# Changing to Tri-Gram and Building the Model again

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Same configuration as the unigram vectorizer, except ngram_range=(3,3):
# the vocabulary consists solely of sequences of exactly three consecutive
# tokens (trigrams); single words and word pairs are not extracted.
bow_vector = CountVectorizer(max_df=0.90,
                            min_df=2,
                            ngram_range=(3,3),
                            max_features = 1000,
                            stop_words = 'english')

In [None]:
# Re-fit on the cleaned tweets; this overwrites the earlier `bow` matrix.
bow = bow_vector.fit_transform(data["cleaned_text"])

In [None]:
# Vocabulary terms in column order, printed to inspect the extracted n-grams.
bow_vector_names = bow_vector.get_feature_names_out()
print(bow_vector_names)

['abl follow anyon' 'alon friday night' 'anoth friday night'
 'anyon wan na' 'aw im sorri' 'aww feel bad' 'aww im sorri'
 'bank holiday monday' 'bed good night' 'bed happi mother'
 'bed nighti night' 'bee poli fibe' 'best day life' 'big bang theori'
 'big hair rock' 'birthday happi birthday' 'bore wan na' 'boy tell em'
 'brit wait day' 'britain got talent' 'btw happi mother' 'busi busi busi'
 'carter say video' 'carter video privat' 'celebr mother day'
 'cool look forward' 'count day till' 'coupl week ago' 'crochet bee poli'
 'day happi mother' 'day im work' 'day left school' 'day like today'
 'day love mom' 'day mom hope' 'day mom love' 'day mother day'
 'day mother world' 'day new job' 'day thought gon' 'day ur mom'
 'day wonder mom' 'didnt work today' 'doesnt feel like' 'doesnt look good'
 'doesnt look like' 'doesnt sound good' 'doesnt wan na' 'dont feel good'
 'dont feel like' 'dont know gon' 'dont know ill' 'dont know im'
 'dont know say' 'dont look like' 'dont realli wan' 'dont t

In [None]:
# Dense feature table for the n-gram counts; replaces the earlier
# `model_dataframe`.
model_dataframe = pd.DataFrame(bow.toarray(),columns=bow_vector_names)

# Train Test Split for Tri-Gram

In [None]:
data.head()

Unnamed: 0,textID,text,selected_text,sentiment,cleaned_text
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,id respond go
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,sooo sad miss san diego
2,088c60f138,my boss is bullying me...,bullying me,negative,boss bulli
3,9642c003ef,what interview! leave me alone,leave me alone,negative,interview leav alon
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,son couldnt put releas alreadi bought


In [None]:
# Target labels: the `sentiment` column of the loaded dataset.
y = data["sentiment"]

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
x_train_bow, x_test_bow, y_train, y_test = train_test_split(
    model_dataframe,
    y,
    test_size=0.3,
    random_state=42,
)

# Logistic Regression with Tri-Gram

In [None]:
from sklearn.linear_model import LogisticRegression
# Raise the iteration cap above scikit-learn's default of 100: with the default
# the solver stops early on this data with "TOTAL NO. of ITERATIONS REACHED
# LIMIT", leaving the model not fully converged.
lgr = LogisticRegression(max_iter=1000)

In [None]:
# Fit the classifier on the training split only.
lgr.fit(x_train_bow,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
lgr.score(x_train_bow,y_train)

0.45898315658140987

In [None]:
# Predicted sentiment labels for the held-out test rows.
predictions = lgr.predict(x_test_bow)

In [None]:
from sklearn.metrics import confusion_matrix,classification_report

In [None]:
confusion_matrix(y_test,predictions)

array([[  96, 2213,   29],
       [  59, 3256,   56],
       [  23, 2260,  253]])

In [None]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

    negative       0.54      0.04      0.08      2338
     neutral       0.42      0.97      0.59      3371
    positive       0.75      0.10      0.18      2536

    accuracy                           0.44      8245
   macro avg       0.57      0.37      0.28      8245
weighted avg       0.56      0.44      0.32      8245

