In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK data this notebook reads: the English stop-word list and the
# Punkt tokenizer data that nltk.word_tokenize loads.
# NOTE(review): newer NLTK releases look the tokenizer data up as "punkt_tab",
# older ones as "punkt"; both are requested so either release finds it. An id a
# given release does not know is expected to be reported, not raised - confirm.
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("punkt_tab")

In [None]:
# Location of the raw tweets CSV.
# NOTE(review): this is an absolute, user-specific Windows path, so the cell
# only runs on a machine where exactly this folder exists.
tweets_csv_path = "C:/Users/jenis/Desktop/DDSC_Sentiment Analysis/Twitter/Tweets.csv"
ds = pd.read_csv(tweets_csv_path)

In [None]:
# Preview the first rows of the loaded frame.
ds.head()

In [None]:
# Check for missing values, then drop every row that contains one.
# The check is printed explicitly: a bare expression that is not the last
# statement of a cell is evaluated and thrown away without being displayed.
print("Any null values:", ds.isnull().values.any())
ds = ds.dropna()

In [None]:
# Print the frame's column summary (dtypes and non-null counts).
ds.info()

In [None]:
# Count fully duplicated rows: duplicated() yields a boolean mask and .sum()
# adds it up, so the displayed number is the total of duplicate rows.
# (The author's original note reports this total as 0 for their dataset.)
ds.duplicated().sum()

In [None]:
# Encode the sentiment label as a numeric class id in a new column "tag_id":
#   neutral -> 0, positive -> 1, negative -> 2
# One vectorised map replaces the row-by-row loop. The loop used chained
# assignment (ds['tag_id'][i] = ...), which writes through an intermediate
# Series: pandas warns about it, and with copy-on-write enabled the write never
# reaches ds. Labels outside the mapping are kept unchanged, as before.
sentiment_to_tag = {"neutral": 0, "positive": 1, "negative": 2}
ds['tag_id'] = ds['sentiment'].map(lambda label: sentiment_to_tag.get(label, label))

In [None]:
# Preview the frame again, now including the "tag_id" column.
ds.head()

In [None]:
# Percentage distribution of sentiment in the data.
# value_counts() orders its result by frequency and indexes it by label, so
# each class is looked up by its label instead of by position; positional
# access only gave the right class while the frequencies happened to be ordered
# neutral > positive > negative. .get(..., 0) covers a class that never occurs.
count = ds['sentiment'].value_counts()
total_count = len(ds)
neutral_count = count.get("neutral", 0)
positive_count = count.get("positive", 0)
negative_count = count.get("negative", 0)
# Whole-number percentages of the total row count.
neutral_percentage = round((neutral_count / total_count) * 100)
positive_percentage = round((positive_count / total_count) * 100)
negative_percentage = round((negative_count / total_count) * 100)

In [None]:
# Show the rounded percentage stored in neutral_percentage.
neutral_percentage

In [None]:
# Show the rounded percentage stored in positive_percentage.
positive_percentage

In [None]:
# Show the rounded percentage stored in negative_percentage.
negative_percentage

In [None]:
pip install wordcloud

In [None]:
# Word cloud of the tweet texts.
# The texts are joined into one string explicitly: str() of a Series is only
# its printed preview (index labels, an elided middle for a long Series and a
# "Name: ..., dtype: ..." footer), so the cloud would be built from a handful
# of rows plus that bookkeeping text instead of the whole column.
a = ds['text']
all_text = " ".join(a.astype(str))
wordcloud = WordCloud(background_color="white").generate(all_text)
plt.imshow(wordcloud)
# Hide the axes once the image is drawn; ticks carry no meaning for a word cloud.
plt.axis('off')
plt.show()

In [None]:
# Demonstrate converting the tweet text to upper case / lower case.
# NOTE: none of the converted results below is assigned to a name, and only the
# last expression of a cell is displayed.
tweet_text=ds['text']
print(tweet_text)
type(tweet_text)  # evaluated but not shown (not the last expression)

## Upper case
tweet_text.str.upper()  # result is discarded

## Lower case
tweet_text.str.lower()  # last expression: this is the cell's displayed output

In [None]:
# Negation words that must survive stop-word removal.
neg_sw = ["no", "nor", "not"]
# Full English stop-word list from NLTK, as a set.
sw = set(stopwords.words("english"))
# Stop words that will actually be filtered out: everything in sw except the
# negations listed in neg_sw.
pos_sw = list(filter(lambda word: word not in neg_sw, sw))

In [None]:
import string
from nltk.stem import PorterStemmer
# Shared stemmer instance; transform_text calls ps.stem on every token it keeps.
ps = PorterStemmer()

In [None]:
def transform_text(text):
    """Normalise one tweet into a space-separated string of stemmed tokens.

    Steps, applied in order:
      1. lower-case the text and tokenise it with ``nltk.word_tokenize``;
      2. keep only purely alphanumeric tokens (``str.isalnum``);
      3. drop tokens contained in the module-level ``pos_sw`` stop-word list;
      4. stem every remaining token with the module-level stemmer ``ps``.

    Args:
        text: the raw tweet as a ``str``.

    Returns:
        The surviving stems joined by single spaces; an empty string when no
        token survives.
    """
    tokens = nltk.word_tokenize(text.lower())
    # A separate punctuation test is unnecessary: a token that passes
    # isalnum() is non-empty and made only of letters/digits, so it can never
    # be found inside string.punctuation.
    return " ".join(
        ps.stem(token)
        for token in tokens
        if token.isalnum() and token not in pos_sw
    )

In [None]:
# Run transform_text on every tweet and store the result in a new column.
ds['trans_text'] = ds['text'].apply(transform_text)

In [None]:
# Preview the frame with the new "trans_text" column.
ds.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features over the cleaned text: unigram and bigram counts,
# capped at 8000 vocabulary entries.
cv = CountVectorizer(max_features = 8000, ngram_range=(1,2))
# Keep the matrix sparse. Converting it with .toarray() allocates a dense
# (n_tweets x 8000) array that is almost entirely zeros; train_test_split and
# the scikit-learn classifiers used in this notebook accept the sparse matrix
# directly.
X = cv.fit_transform(ds['trans_text'])
# Target: the original string sentiment labels.
y = ds['sentiment']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, multilabel_confusion_matrix,recall_score

In [None]:
# Candidate models compared in this notebook.
mnb = MultinomialNB()
bnb = BernoulliNB()
# multi_class is no longer passed: the parameter is deprecated in recent
# scikit-learn releases, and with the default lbfgs solver a target with more
# than two classes is already fitted as a multinomial model.
lr = LogisticRegression(max_iter=300)
rfc = RandomForestClassifier(n_estimators=50, random_state=2, max_depth=25)
# Seeded like the random forest so that repeated runs build the same tree.
dtc = DecisionTreeClassifier(max_depth=30, random_state=2)

In [None]:
def train_classifier(clf,X_train,y_train,X_test,y_test,average="macro"):
    """Fit ``clf`` on the training split and score it on the test split.

    Args:
        clf: estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train, y_train: training features and labels.
        X_test, y_test: held-out features and labels.
        average: averaging mode forwarded to ``precision_score``. Defaults to
            ``"macro"`` (unweighted mean of the per-class precisions). With
            ``"micro"`` and exactly one label per sample, every wrong
            prediction counts once as a false positive, so the value is
            numerically identical to the accuracy and adds no information.

    Returns:
        ``(accuracy, precision)`` measured on the test split.
    """
    clf.fit(X_train,y_train)
    y_pred=clf.predict(X_test)
    accuracy=accuracy_score(y_test,y_pred)
    precision=precision_score(y_test,y_pred,average=average)

    return accuracy,precision

In [None]:
# Display label -> estimator, in the order the models are trained and reported.
# NOTE(review): the "Clasifier" spelling is kept as-is because these strings
# are dictionary keys and end up verbatim in the printed results.
clfs = dict([
    ('Multinomial Naive Bayes', mnb),
    ('Bernoulli Naive Bayes', bnb),
    ('Logistic Regression', lr),
    ('Random Forest Clasifier', rfc),
    ('Decision Tree Clasifier', dtc),
])

In [None]:
# Train and score every model in clfs, collecting the metrics (in dict order)
# for the summary table and printing them as each model finishes.
accuracy_scores, precision_scores = [], []

for name, clf in clfs.items():
    current_accuracy, current_precision = train_classifier(
        clf, X_train, y_train, X_test, y_test
    )
    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)

    report_lines = (
        ("For ", name),
        ("Accuracy - ", current_accuracy),
        ("Precision - ", current_precision),
    )
    for label, value in report_lines:
        print(label, value)

In [None]:
# Summary table: one row per model, highest precision first, with a fresh
# 0..n-1 index after sorting.
summary_columns = {
    "Algorithm": clfs.keys(),
    "Accuracy": accuracy_scores,
    "Precision": precision_scores,
}
performance = pd.DataFrame(summary_columns).sort_values(
    "Precision",
    ascending=False,
    ignore_index=True,
)

In [None]:
# Display the model comparison table.
performance