# Project Requirements

- **Imports**

In [21]:
import re
import nltk
import numpy as np
import pandas as pd 
from IPython.display import clear_output
from warnings import filterwarnings
filterwarnings("ignore")

- **Necessary Installs & downloads**

In [22]:
nltk.download('stopwords')
clear_output()

# Read Data

In [23]:
df = pd.read_csv("data.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Tweet,rate
0,0,#FollowFriday @France_Inte @PKuchly57 @Milipol...,1
1,1,@Lamb2ja Hey James! How odd :/ Please call our...,1
2,2,@DespiteOfficial we had a listen last night :)...,1
3,3,@97sides CONGRATS :),1
4,4,yeaaaah yippppy!!! my accnt verified rqst has...,1


# Data Preprocessing

### 1) Text processing:

- **Text Cleaning**
    - Use `re` package to remove hyperlinks, twitter marks and styles

In [24]:
def clean(tweet):
    tweet=re.sub('(#|@)\w*',"",tweet)# \w [a-z|A_Z|0-9|_]
    tweet=re.sub("https?:\/\/\S+","",tweet)
    tweet=re.sub("(\?|!)+"," ",tweet)
    tweet=re.sub("\s\d+\s","",tweet)
    tweet=re.sub("(\.|\,)+","",tweet)
    tweet=re.sub("^\s+","",tweet)
    tweet=re.sub("\s+$","",tweet)
    return tweet

- **Tokenization**

In [25]:
def tokenize(tweet):
    return tweet.split()

- **Lowercase words**

In [26]:
def lowercase(tweet):
    return [token.lower() for token in tweet]

- **Remove Stop words**

In [27]:
from nltk.corpus import stopwords
stop_words=stopwords.words('english')

In [28]:
def rmv_stop_words(tweet):
    return [token for token in tweet if token not in stop_words]

- **Stemming**

In [29]:
from nltk.stem import PorterStemmer
stemmer=PorterStemmer()

In [30]:
def stemming(tweet):
    return [stemmer.stem(token) for token in tweet]

- **All of above in one method**

In [31]:
def text_processing(tweets):
    return np.array([stemming(rmv_stop_words(lowercase(tokenize(clean(tweet))))) for tweet in tweets])

- **Apply Text preprocessing**

In [32]:
tweets = text_processing(df.Tweet)
rates = df.rate

### B) Feature Extraction (BOW & Tf-iDF)

In [33]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [17]:
# you must change the tweets back into sentence to apply bag of words
corpus = [' '.join(tweet) for tweet in tweets]

In [53]:
vectorizer = CountVectorizer()
# vectorizer = TfidfVectorizer() # uncomment to use Tf-iDF instead of Bag of Words

new_tweets = vectorizer.fit_transform(corpus).toarray()

# feature_names = vectorizer.get_feature_names_out()

# Data Split

In [54]:
from sklearn.model_selection import train_test_split
X, y = new_tweets, rates
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=.1, random_state=42)

# Logistic Reg Model

**Training**

In [55]:
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

**Evaluation**

In [56]:
from sklearn.metrics import accuracy_score

In [57]:
train_pred = lr_model.predict(X_train)
val_pred = lr_model.predict(X_val)
test_pred = lr_model.predict(X_test)

tr_acc = round(accuracy_score(train_pred, y_train), 4)
val_acc = round(accuracy_score(val_pred, y_val), 4)
ts_acc = round(accuracy_score(test_pred, y_test), 4)

In [58]:
print(f"Training Accuracy ==> {tr_acc*100}%")
print(f"Validation Accuracy ==> {val_acc*100}%")

Training Accuracy ==> 86.88%
Validation Accuracy ==> 73.88%


# Naive Bayes ML Model

In [59]:
from sklearn.naive_bayes import GaussianNB

**Training the model**

In [60]:
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

**Evaluating the model**

In [61]:
from sklearn.metrics import accuracy_score 
train_pred = nb_model.predict(X_train)
val_pred = nb_model.predict(X_val)

tr_acc = round(accuracy_score(y_train, train_pred), 4)
val_acc = round(accuracy_score(y_val, val_pred), 4)

In [62]:
print(f"{tr_acc * 100}%")
print(f"{val_acc * 100}%")

88.18%
61.5%


# The final Test

In [63]:
test_pred = lr_model.predict(X_test)
ts_acc = round(accuracy_score(y_test, test_pred), 4)
print(f"{ts_acc * 100}%")

73.5%
