# Clasifying Tweet sentiments


© Explore Data Science Academy

In [1]:
# import relevant libraries
import nltk
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re
pd.set_option('display.max_rows', 100)

from nltk.corpus import stopwords
from sklearn.metrics import classification_report

# set plot style
sns.set()

In [2]:
# Loading Data
df = pd.read_csv('C:/Users/Mpilenhle/Documents/EDSA/Classification/Advanced_Classification_Predict-student_data-2780/train.csv')
df_test = pd.read_csv('C:/Users/Mpilenhle/Documents/EDSA/Classification/Advanced_Classification_Predict-student_data-2780/test_with_no_labels.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/Mpilenhle/Documents/EDSA/Classification/Advanced_Classification_Predict-student_data-2780/train.csv'

In [None]:
# looking at the data
df.head(15)

In [None]:
df[df['sentiment'] == -1]

In [None]:
df.shape

In [None]:
df_test.head()

In [None]:
df_test.shape

In [None]:
#looking at the different types of labels
type_labels = list(df.sentiment.unique())
print(type_labels)

In [None]:
# visualising the data distribution
df['sentiment'].value_counts().plot(kind = 'bar')
plt.show()

# Removing Noise from the data

In [None]:
#remmoving the urls
pattern_url = r'http[s]?://t.co/[A-Za-z0-9]+'
subs_url = r'url-web'
df['message'] = df['message'].replace(to_replace = pattern_url, value = subs_url, regex = True)

In [None]:
v = df['message']
c = [i for i in v]
for i in range(50):
    print(str(i) +')' + c[i] +'\n')

In [None]:
#remmoving the Retweets
pattern_url = r'RT\s\@[A-Za-z0-9_]+:'
subs_url = r''
df['message'] = df['message'].replace(to_replace = pattern_url, value = subs_url, regex = True)

In [None]:
#remmoving the mentions 
pattern_url = r'@[A-Za-z0-9_]+'
subs_url = r''
df['message'] = df['message'].replace(to_replace = pattern_url, value = subs_url, regex = True)

In [None]:
#remmoving the Hashtags 
pattern_url = r'\#[A-Za-z0-9#?_]+'
subs_url = r''
df['message'] = df['message'].replace(to_replace = pattern_url, value = subs_url, regex = True)

In [None]:
#remmoving the remaining https
pattern_url = r'https:[.*?]+'
subs_url = r''
df['message'] = df['message'].replace(to_replace = pattern_url, value = subs_url, regex = True)

In [None]:
# turning all tweets to lower case
df['message'] = df['message'].str.lower()

In [None]:
# punctuation remover function
import string
def remove_punctuation(tweets):
    return ''.join([l for l in tweets if l not in string.punctuation])

In [None]:
# using apply method to remove the punctuation marks
df['message'] = df['message'].apply(remove_punctuation)

In [None]:
#creating a function for removing emojis
import re
def remove_emoji(string):
    emoji_pattern = re.compile("[" 
                u"\U0001F600-\U0001F64F"  # emoticons
                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                u"\U0001F680-\U0001F6FF"  # transport & map symbols
                u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                u"\U00002702-\U000027B0"
                u"\U000024C2-\U0001F251"
                u"\U0001f926-\U0001f937"
                u'\U00010000-\U0010ffff'
                u"\u200d"
                u"\u2640-\u2642"
                u"\u2600-\u2B55"
                u"\u23cf"
                u"\u23e9"
                u"\u231a"
                u"\u3030"
                u"\ufe0f"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

In [None]:
# Removing the emojis using the apply method
df['message'] = df['message'].apply(remove_emoji)

# Tokenizing the tweets

In [None]:
# importing tokenizing library
from nltk.tokenize import word_tokenize, TreebankWordTokenizer

In [None]:
tokeniser = TreebankWordTokenizer()
df['tokens'] = df['message'].apply(tokeniser.tokenize)

In [None]:
df.head()

# Lemmatizing the tweets

In [None]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

In [None]:
# lemmatizing function
def tweet_lemma(words, lemmatizer):
    return [lemmatizer.lemmatize(word) for word in words if word.isalpha()]    

In [None]:
df['lemma'] = df['tokens'].apply(tweet_lemma, args=(lemmatizer, ))

In [None]:
df.head()

# Stemming the Lemma Tokens

In [None]:
#importing stemmer library
from nltk import SnowballStemmer, PorterStemmer, LancasterStemmer

In [None]:
#Stemmer function
def token_stemmer(words, stemmer):
    return [stemmer.stem(word) for word in words]

In [None]:
# find the stem of each word in the original tokens
stemmer = SnowballStemmer('english')
df['original_stem'] = df['tokens'].apply(token_stemmer, args=(stemmer, ))

In [None]:
# find the stem of each word in the Lemma tokens
stemmer = SnowballStemmer('english')
df['lemma_stem'] = df['lemma'].apply(token_stemmer, args=(stemmer, ))

In [None]:
df.lemma_stem.head()

# Removing the stop words

In [None]:
def remove_stop_words(tokens):    
    return [t for t in tokens if t not in stopwords.words('english')]

In [None]:
df['lemma_no_stop_words'] = df['lemma_stem'].apply(remove_stop_words)

In [None]:
df['original_no_stop_words'] = df['lemma_stem'].apply(remove_stop_words)

In [None]:
df.head()

# Creating Ngrams

In [None]:
'''
The function creates a new data frame with 2 ngrams to test
returns a function with a new column of ngrams

'''
from nltk.util import ngrams
def ngramconvert(df,n=2):
    for item in df.columns:
        df['new_'+item]=df[item].apply(lambda sentence: list(ngrams(sentence.split(), n)))
    return df

In [None]:
ngram_df = ngramconvert(df[['message']],2)
ngram_df.head()

# Creating a count vector with 2 ngrams not scaled


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [None]:
# splitting the data
X = df['message']
y = df['sentiment']

In [None]:
# splitting the data
X_train1, X_test1, y_train, y_test = train_test_split(X, y,test_size=0.2,
random_state=53)

In [None]:
count_vectorizer = CountVectorizer(ngram_range=(1, 2))
ngram_count_train1 = count_vectorizer.fit_transform(X_train1.values)
ngram_count_test1 = count_vectorizer.transform(X_test1.values)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

nb_classifier = MultinomialNB()
nb_classifier.fit(ngram_count_train1, y_train)

ngram_pred = nb_classifier.predict(ngram_count_test1)
metrics.accuracy_score(y_test, ngram_pred)

In [None]:
print('2 ngram  Model')
print(classification_report(y_test, ngram_pred, target_names= ['-1', '0', '1', '2']))

In [None]:
#traing the model with the entire data
yf = df['sentiment']
Xf = df['message']

In [None]:
X_test = df_test['message']

In [None]:
count_vectorizerf = CountVectorizer(ngram_range=(1, 2))
count_finalf = count_vectorizerf.fit_transform(Xf)
count_final_test = count_vectorizerf.transform(X_test)

In [None]:
# Import the scaler module
from sklearn import preprocessing

fscaler = preprocessing.MaxAbsScaler()
f_scaled = fscaler.fit_transform(count_finalf)
f_scaled_test = fscaler.transform(count_final_test)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

final_classifier2 = MultinomialNB()
final_classifier2.fit(f_scaled, yf)


In [None]:
final_preds = final_classifier.predict(count_final_test)

In [None]:
final_preds = final_classifier.predict(count_final_test)
daf=pd.DataFrame(final_preds, columns=['sentiment'])
daf.head()

output = pd.DataFrame({"tweetid":df_test['tweetid']})
submission3 = output.join(daf)        
submission3.to_csv("submission3.csv", index=False)

In [None]:
output = pd.DataFrame({"tweetid":df_test['tweetid']})
submission3 = output.join(daf)        
submission3.to_csv("submission3.csv", index=False)

# using  The cleaned column



In [None]:
clean_sentences = [" ".join(i) for i in df['lemma_no_stop_words']]
df['clean_sentences'] = clean_sentences
df.head()

In [None]:
x_2 = df['clean_sentences']
y_2 = df['sentiment']

In [None]:
x_2.head()

In [None]:
# splitting the data
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(x_2, y_2,test_size=0.2,
random_state=53)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

vectorizer = TfidfVectorizer()
vectorised_train_documents = vectorizer.fit_transform(X_train_2)
vectorised_test_documents = vectorizer.transform(X_test_2)

In [None]:
from yellowbrick.text import FreqDistVisualizer
features = vectorizer.get_feature_names()
visualizer = FreqDistVisualizer(features=features, orient='v', n = 25)
visualizer.fit(vectorised_train_documents)
visualizer.show()

In [None]:
from yellowbrick.text import UMAPVisualizer

umap = UMAPVisualizer(metric="cosine")
umap.fit(vectorised_train_documents)
umap.show()

In [None]:
from yellowbrick.regressor import ResidualsPlot
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1
)

visualizer = ResidualsPlot(LinearRegression())
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show()

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

nb_classifier = MultinomialNB()
nb_classifier.fit(vectorised_train_documents, y_train_2)

bag_pred = nb_classifier.predict(vectorised_test_documents)
metrics.accuracy_score(y_test_2, bag_pred)

In [None]:
print('bag of words')
print(classification_report(y_test_2, bag_pred, target_names= ['-1', '0', '1', '2']))

In [None]:
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier


In [None]:
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=1, random_state=0).fit(vectorised_train_documents, y_train_2)
clf.score(vectorised_test_documents, y_test_2)

In [None]:
b_pred = clf.predict(vectorised_test_documents)

In [None]:
print(classification_report(y_test_2, b_pred, target_names= ['-1', '0', '1', '2']))

In [None]:
count_fnal = 

final_pred = clf.predict(count_final_test)
daf=pd.DataFrame(final_pred, columns=['sentiment'])
daf.head()

output = pd.DataFrame({"tweetid":df_test['tweetid']})
submission4 = output.join(daf)        
submission4.to_csv("submission4.csv", index=False)

In [None]:
# Import the scaler module
from sklearn import preprocessing

fscaler = preprocessing.MaxAbsScaler()
scaled_train = fscaler.fit_transform(vectorised_train_documents)
scaled_test = fscaler.transform(vectorised_test_documents)

In [None]:
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=1, random_state=0).fit(vectorised_train_documents, y_train_2)
clf.score(scaled_test, y_test_2)

In [None]:
s_pred = clf.predict(scaled_test)

In [None]:
print(classification_report(y_test_2, s_pred, target_names= ['-1', '0', '1', '2']))

In [None]:
vectorised_train_documents
vectorised_test_documents