### 1. Importing Libraries

In [7]:
import re
import nltk
import joblib
import pickle
import pandas as pd
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

stemmer=PorterStemmer()

### 2.Defining Functions

In [2]:
def loading_tweets(file_name):
    df=pd.read_csv(file_name,header=None)
    df=df.rename({0:'id',1:'company',2:'sentiment',3:'raw_tweet'},axis=1)

    df=df[df['sentiment']!='Irrelevant']
    df=df[df['sentiment']!='Neutral']
    df=df.dropna()
    return df['raw_tweet'].values,df
def preprocessing_tweets(tweets):
    processed_tweets=[]
    for tweet in tqdm(tweets):
        tweet=re.sub(r'[^a-zA-Z]',' ',tweet)
        tweet=tweet.lower()
        tweet=[word for word in tweet.split(' ') if word not in stopwords.words('english')]
        tweet=[stemmer.stem(word) for word in tweet]
        tweet=[word for word in tweet if len(word)!=0]
        tweet=' '.join(tweet)
        processed_tweets.append(tweet)
    return tweets

### 3. Preparining Training Data

In [3]:
train_tweets,df_train=loading_tweets('tweets/twitter_training.csv')
train_tweets=preprocessing_tweets(train_tweets)

tfidf=TfidfVectorizer()
train_tweets=tfidf.fit_transform(train_tweets)

x_train=train_tweets.toarray()
y_train=pd.get_dummies(df_train['sentiment']).values[:,1:]

100%|████████████████████████████████████████████████████████████████████████████| 43013/43013 [07:13<00:00, 99.28it/s]


### 4. Preparing Testing Data

In [5]:
test_tweets,df_test=loading_tweets('tweets/twitter_validation.csv')
test_tweets=preprocessing_tweets(test_tweets)

test_tweets=tfidf.transform(test_tweets)

x_test=test_tweets.toarray()
y_test=pd.get_dummies(df_test['sentiment']).values[:,1:]

100%|████████████████████████████████████████████████████████████████████████████████| 543/543 [00:05<00:00, 92.89it/s]


### 5. Model Building / Training

In [6]:
mdl=MultinomialNB()

mdl.fit(x_train,y_train)

  y = column_or_1d(y, warn=True)


### 6. Model Evaluation 

In [15]:
print("Testing Accuracy : ",round(accuracy_score(mdl.predict(x_test),y_test)*100,3))
print("Training Accuracy : ",round(accuracy_score(mdl.predict(x_train),y_train)*100,3))

Testing Accuracy :  94.107
Training Accuracy :  91.607


### 7. Model Saving  & Encoder

In [17]:
joblib.dump(tfidf,'tfidf_model.joblib')     # TF-IDF Encoder
pickle.dump(mdl,open('model.mdl','wb'))     # Model

print('encoder is saved with name tfidf_model.joblib!')
print('encoder is saved with name model.mdl!')

encoder is saved with name tfidf_model.joblib!
encoder is saved with name model.mdl!
