In [28]:
import re
import nltk
import joblib
import pickle
import pandas as pd
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix

stemmer = PorterStemmer()

In [2]:
df = pd.read_csv('twitter_training.csv', header=None)
df = df.rename({0 : 'id', 1 : 'company', 2 : 'sentiment', 3 : 'tweet'}, axis = 1)

In [3]:
df.head()

Unnamed: 0,id,company,sentiment,tweet
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [4]:
df.isnull().sum()

id             0
company        0
sentiment      0
tweet        686
dtype: int64

In [5]:
df= df.dropna()

In [6]:
df['sentiment'].unique()

array(['Positive', 'Neutral', 'Negative', 'Irrelevant'], dtype=object)

In [7]:
df = df[df['sentiment'] != 'Irrelevant']
df = df[df['sentiment'] != 'Neutral']

In [8]:
df['sentiment'].unique()

array(['Positive', 'Negative'], dtype=object)

In [9]:
df_train = df
train_tweets = df_train['tweet'].values

In [10]:
def loading_tweets(file_name):
    df = pd.read_csv(file_name, header = None)
    df = df.rename({0 : 'id', 1 : 'company', 2 : 'sentiment', 3 : 'raw_tweet'}, axis = 1)
    df = df[df['sentiment'] != 'Irrelevant']
    df = df[df['sentiment'] != 'Neutral']
    df = df.dropna()
    return df['raw_tweet'].values, df

## Preprocessing
Removing special chracters, stop words and applying stemming

In [11]:
def preprocessing_tweets(tweets): 
    processed_tweets = []
    for tweet in tqdm(tweets):
        tweet = re.sub(r'[^a-zA-Z]',' ',tweet)    
        tweet = tweet.lower()
        tweet = [word for word in tweet.split(' ') if not word in stopwords.words('english')]
        tweet = [stemmer.stem(word) for word in tweet]
        tweet = [word for word in tweet if len(word) != 0]
        tweet = ' '.join(tweet)
        processed_tweets.append(tweet)
    return tweets

In [12]:
train_tweets = preprocessing_tweets(train_tweets)

100%|██████████| 43013/43013 [02:35<00:00, 277.08it/s]


## Prepairing Training Data

In [13]:
tfidf = TfidfVectorizer()
train_tweets = tfidf.fit_transform(train_tweets)

x_train = train_tweets.toarray()
y_train = pd.get_dummies(df_train['sentiment']).values[:,1:]

## Prepairing Testing Data

In [14]:
test_tweets, df_test  = loading_tweets('twitter_validation.csv')
test_tweets            = preprocessing_tweets(test_tweets)

test_tweets = tfidf.transform(test_tweets)

x_test = test_tweets.toarray()
y_test = pd.get_dummies(df_test['sentiment']).values[:,1:]

100%|██████████| 543/543 [00:02<00:00, 244.05it/s]


 ## Model Building / Training

In [15]:
mdl = MultinomialNB()

mdl.fit(x_train,y_train)

  y = column_or_1d(y, warn=True)


## Model Evaluation 

In [16]:
print('Training Accuracy :', round(accuracy_score(mdl.predict(x_train), y_train)*100,3))
print('Testing Accuracy  :', round(accuracy_score(mdl.predict(x_test), y_test)*100,3))

Training Accuracy : 91.607
Testing Accuracy  : 94.107


## Including Neutral Sentiment

In [17]:
def loading_tweets(file_name):
    df = pd.read_csv(file_name, header=None)
    df = df.rename({0: 'id', 1: 'company', 2: 'sentiment', 3: 'raw_tweet'}, axis=1)
    df = df[df['sentiment'] != 'Irrelevant']
    df = df.dropna()

    # Create a new column for encoded sentiment
    sentiment_mapping = {'Neutral': 0, 'Positive': 1, 'Negative': 2}
    df['sentiment_encoded'] = df['sentiment'].map(sentiment_mapping)

    return df['raw_tweet'].values, df['sentiment_encoded'].values, df

def preprocessing_tweets(tweets):
    processed_tweets = []
    for tweet in tqdm(tweets):
        tweet = re.sub(r'[^a-zA-Z]', ' ', tweet)
        tweet = tweet.lower()
        tweet = [word for word in tweet.split(' ') if not word in stopwords.words('english')]
        tweet = [stemmer.stem(word) for word in tweet]
        tweet = [word for word in tweet if len(word) != 0]
        tweet = ' '.join(tweet)
        processed_tweets.append(tweet)
    return processed_tweets


In [18]:
train_tweets, y_train, df_train = loading_tweets('twitter_training.csv')
train_tweets = preprocessing_tweets(train_tweets)

tfidf = TfidfVectorizer()
train_tweets = tfidf.fit_transform(train_tweets)

x_train = train_tweets.toarray()

test_tweets, y_test, df_test = loading_tweets('twitter_validation.csv')
test_tweets = preprocessing_tweets(test_tweets)

test_tweets = tfidf.transform(test_tweets)

x_test = test_tweets.toarray()


100%|██████████| 61121/61121 [03:53<00:00, 261.96it/s]
100%|██████████| 828/828 [00:04<00:00, 199.66it/s]


In [19]:
mdl = MultinomialNB()

mdl.fit(x_train, y_train)

print('Training Accuracy :', round(accuracy_score(mdl.predict(x_train), y_train) * 100, 3))
print('Testing Accuracy  :', round(accuracy_score(mdl.predict(x_test), y_test) * 100, 3))

Training Accuracy : 81.944
Testing Accuracy  : 83.092
