In [1]:
# Importing all the required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import datetime
from datetime import datetime

import re
import string
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud
from collections import Counter

from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import accuracy_score,confusion_matrix,precision_score

In [2]:
df=pd.read_csv('Nestle big boi.csv',encoding='ISO-8859-1')
df.head()

Unnamed: 0,Date,Text,Username,Like Count,Retweet Count,Follower Count
0,2010-12-30 16:00:48+00:00,Love the #Nestle tvc on ... Really awesome,Dominication,0.0,0.0,366.0
1,2010-12-30 10:20:53+00:00,RT @bmi_cfw: L'Oreal Continues To Work With #N...,BMIResearch,0.0,0.0,17221.0
2,2010-12-29 06:00:08+00:00,*HOT* #Nestle Nesquik $1.25 Off #Coupon http:/...,maritramos,0.0,1.0,7771.0
3,2010-12-28 22:01:24+00:00,Oatmeal Scotchies!!! Best cookies ever! http:/...,megansmunchies,0.0,0.0,1811.0
4,2010-12-28 18:51:03+00:00,RT @ameliatimbers U.S. #Muslims : A New #Consu...,AndresTTapia,0.0,1.0,3291.0


In [3]:
# Drop Username column
df.drop('Username',inplace=True,axis=1)

In [4]:
# Check any missing values
df.isnull().sum()

Date               30
Text               80
Like Count        160
Retweet Count     240
Follower Count    240
dtype: int64

In [5]:
for col in df.columns:
    print(col,':',(df[col].isnull().sum()/len(df))*100,'%')

Date : 0.017480887562931195 %
Text : 0.04661570016781652 %
Like Count : 0.09323140033563304 %
Retweet Count : 0.13984710050344956 %
Follower Count : 0.13984710050344956 %


In [6]:
# Drop the missing values as its count is very less compared to the entire dataset
df.dropna(inplace=True)
len(df)

171376

In [8]:
# extract hashtags from the Text column
hashtags = []
for text in df['Text']:
    hashtags += re.findall(r'#\w+', text)

# count the frequency of each hashtag
freq = Counter(hashtags)

# sort the hashtags by frequency
sorted_hashtags = sorted(freq.items(), key=lambda x: x[1], reverse=True)

# extract the top 20 hashtags
top_20_hashtags = sorted_hashtags[:20]

# print the top 20 hashtags with their frequency
for hashtag, count in top_20_hashtags:
    print(f"{hashtag}: {count}")

#Maggi: 17046
#Nestle: 16270
#maggi: 12125
#nestle: 10305
#boost: 8378
#smarties: 8357
#kitkat: 7519
#nespresso: 7143
#Nespresso: 6943
#coffee: 6938
#carnation: 6923
#NestleIndia: 6415
#nescafe: 6334
#NestlÃ: 5692
#Nescafe: 5520
#DolceGusto: 5421
#KitKat: 5171
#dolcegusto: 5100
#Carnation: 4725
#Smarties: 4301


In [9]:
# Remove URLs, mentions, and hashtags
def remove_twitter_elements(tweet):
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)
    tweet = re.sub(r'\@\w+|\#','', tweet)
    return tweet

df['Text'] = df['Text'].apply(remove_twitter_elements)

In [10]:
#removing emojis
def remove_emojis(text):
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

df['Text'] = df['Text'].apply(remove_emojis)

In [11]:
#removing punctuations
def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

df['Text'] = df['Text'].apply(remove_punctuation)

In [12]:
# Remove special characters 
def remove_special_characters(tweet):
    tweet = re.sub('[^a-zA-Z0-9\s]', '', tweet)
    return tweet

df['Text'] = df['Text'].apply(remove_special_characters)

In [13]:
# converting to lowercase
df['Text'] = df['Text'].apply(lambda x: x.lower())

In [14]:
# Tokenize the text
def tokenize(tweet):
    tokens = nltk.word_tokenize(tweet)
    return tokens

df['Text'] = df['Text'].apply(tokenize)

In [15]:
# Remove stop words
def remove_stop_words(tokens):
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    return filtered_tokens

df['Text'] = df['Text'].apply(remove_stop_words)

In [16]:
# Stemming or Lemmatization
def stem_tokens(tokens):
    stemmer = SnowballStemmer('english')
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return stemmed_tokens

df['Text'] = df['Text'].apply(stem_tokens)

In [17]:
# Normalize case
def normalize_case(tokens):
    normalized_tokens = [token.lower() for token in tokens]
    return normalized_tokens

df['Text'] = df['Text'].apply(normalize_case)

In [18]:
# join tokens back into a single string
def join_text(tweet):
    joined_tweet = " ".join(tweet)
    return joined_tweet

df['Text'] = df['Text'].apply(join_text)

In [19]:
# Convert date to extract Year, Month, Day
df['Date']=pd.to_datetime(df['Date'])

df['Year'] = df['Date'].apply(lambda date:date.year)
df['Month'] = df['Date'].apply(lambda date:date.month)
df['Day']=df['Date'].apply(lambda date:date.day)
df['Date'] = df['Date'].dt.date

In [20]:
df.head()

Unnamed: 0,Date,Text,Like Count,Retweet Count,Follower Count,Year,Month,Day
0,2010-12-30,love nestl tvc realli awesom,0.0,0.0,366.0,2010,12,30
1,2010-12-30,rt loreal continu work nestl time analyst pond...,0.0,0.0,17221.0,2010,12,30
2,2010-12-29,hot nestl nesquik 125 coupon printablecoupon c...,0.0,1.0,7771.0,2010,12,29
3,2010-12-28,oatmeal scotchi best cooki ever nestl,0.0,0.0,1811.0,2010,12,28
4,2010-12-28,rt us muslim new consum nich nestl increas hal...,0.0,1.0,3291.0,2010,12,28


In [21]:
# define a function to get the polarity of a tweet
def get_tweet_polarity(tweet):
    blob = TextBlob(tweet)
    polarity = blob.sentiment.polarity
    if polarity > 0.05:
        return 'positive'
    elif polarity < -0.05:
        return 'negative'
    else:
        return 'neutral'

# add a polarity column to the DataFrame
df['polarity'] = df['Text'].apply(get_tweet_polarity)


In [22]:
df.head()

Unnamed: 0,Date,Text,Like Count,Retweet Count,Follower Count,Year,Month,Day,polarity
0,2010-12-30,love nestl tvc realli awesom,0.0,0.0,366.0,2010,12,30,positive
1,2010-12-30,rt loreal continu work nestl time analyst pond...,0.0,0.0,17221.0,2010,12,30,neutral
2,2010-12-29,hot nestl nesquik 125 coupon printablecoupon c...,0.0,1.0,7771.0,2010,12,29,positive
3,2010-12-28,oatmeal scotchi best cooki ever nestl,0.0,0.0,1811.0,2010,12,28,positive
4,2010-12-28,rt us muslim new consum nich nestl increas hal...,0.0,1.0,3291.0,2010,12,28,positive


## Model Building

In [23]:
tfidf=TfidfVectorizer(max_features=3000)

In [24]:
X=tfidf.fit_transform(df['Text']).toarray()

In [25]:
X.shape

(171376, 3000)

In [26]:
# Encode the target variable
le = LabelEncoder()
y = le.fit_transform(df['polarity'])

In [27]:
y

array([2, 1, 2, ..., 1, 1, 2])

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=2)

In [29]:
gnb=GaussianNB()
mnb=MultinomialNB()
bnb=BernoulliNB()

In [30]:
#Gaussian Naive Bayes
gnb.fit(X_train,y_train)
y_pred1=gnb.predict(X_test)
print('Accuracy score of Gaussian NB:',accuracy_score(y_test,y_pred1))
print('Confusion matrix of Gaussian NB:')
print(confusion_matrix(y_test,y_pred1))

Accuracy score of Gaussian NB: 0.3936865445209476
Confusion matrix of Gaussian NB:
[[ 2720    50   127]
 [12742  4351  2150]
 [ 5466   247  6423]]


In [31]:
#Multinomial Naive Bayes
mnb.fit(X_train,y_train)
y_pred2=mnb.predict(X_test)
print('Accuracy score of Multinomial NB:',accuracy_score(y_test,y_pred2))
print('Confusion matrix of Multinomial NB:')
print(confusion_matrix(y_test,y_pred2))

Accuracy score of Multinomial NB: 0.8714260707200373
Confusion matrix of Multinomial NB:
[[  903  1578   416]
 [   40 18413   790]
 [   13  1570 10553]]


In [32]:
#Bernoulli Naive Bayes
bnb.fit(X_train,y_train)
y_pred3=bnb.predict(X_test)
print('Accuracy score of Bernoulli NB:',accuracy_score(y_test,y_pred3))
print('Confusion matrix of Bernoulli NB:')
print(confusion_matrix(y_test,y_pred3))

Accuracy score of Bernoulli NB: 0.8856050881082973
Confusion matrix of Bernoulli NB:
[[ 1783   750   364]
 [  468 17174  1601]
 [  139   599 11398]]


In [35]:
df.to_csv('Nestle Preprocessed.csv')