# Business Problem
### This project uses a binary classifier to determine which tweets can be correctly identified as containing either positive sentiment (“Positive emotion”) or negative sentiment (“Negative emotion”).


In [None]:
# imports

import string

import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import f1_score
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from helper import model_helper, word_count_by_class, tweet_finder_by_word

# English stop-word list used when cleaning the tweets
stop = stopwords.words('english')

# 1. Obtain

In [None]:
# read the labelled tweets from disk
# encoding_errors="ignore" skips bytes that cannot be decoded instead of raising
raw_csv_path = "data/emoting_tweets.csv"
df = pd.read_csv(raw_csv_path, encoding_errors="ignore")
df.head()

# 2. Scrub + Explore

In [None]:
# summary statistics for each column of the freshly loaded frame
df.describe()

In [None]:
# (rows, columns) of the freshly loaded frame
df.shape

In [None]:
# column dtypes and non-null counts
# (the previous expression `df.i` is not a DataFrame attribute, so the cell
# raised AttributeError and stopped a top-to-bottom run)
df.info()

In [None]:
# copy the long label column into a new, shorter-named "target" column
# (the original column still exists after this cell; the next cell drops it)
df["target"] = df["is_there_an_emotion_directed_at_a_brand_or_product"]

In [None]:
# remove the long-named label column now that "target" holds the same values
former_target = "is_there_an_emotion_directed_at_a_brand_or_product"
df = df.drop(columns=[former_target])

In [None]:
# view target column distribution as fractions (normalize=True) across all labels
df["target"].value_counts(normalize=True)

In [None]:
# keep only tweets labelled with a positive or negative emotion by dropping
# the two other target classes
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not raise SettingWithCopyWarning
excluded_labels = ["No emotion toward brand or product", "I can't tell"]
df = df[~df["target"].isin(excluded_labels)].copy()

In [None]:
# view target column distribution as fractions after removing the two unused classes
df["target"].value_counts(normalize=True)

In [None]:
# View distribution of the target values as a bar chart
with plt.style.context('fivethirtyeight'):
    # explicit Axes instead of the implicit pyplot state
    fig, ax = plt.subplots()
    df['target'].value_counts().plot(kind='bar', color="c", rot=0, ax=ax)

    # giving title to the plot
    ax.set_title("Distribution of tweet sentiment")

    # giving X and Y labels
    # each bar is a sentiment class, so the x-axis is labelled as such
    ax.set_xlabel("sentiment")
    ax.set_ylabel("# of tweets")

    plt.show()

In [None]:
# view absolute number of tweets per target label
df["target"].value_counts()

In [None]:
# encode the target as an integer flag: 1 for "Positive emotion", 0 for any other label
# NOTE: not idempotent; re-running on the already encoded column yields all zeros
is_positive = df["target"] == "Positive emotion"
df["target"] = is_positive.astype(int)

In [None]:
# confirm change: the counts should now be keyed by 1 and 0
df["target"].value_counts()

In [None]:
# lower-case every tweet; str() is applied first, so a non-string value
# (e.g. a missing tweet) becomes its string form rather than raising
df['tweet_lowercase'] = [str(text).lower() for text in df['tweet_text']]
df.head()

In [None]:
# delete every character listed in string.punctuation from each lower-cased tweet
punctuation_remover = str.maketrans('', '', string.punctuation)
no_punctuation = [tweet.translate(punctuation_remover) for tweet in df['tweet_lowercase']]

In [None]:
# add the no_punctuation list as a column of the dataframe
# (the list was built in row order from df['tweet_lowercase'], so it lines up by position)
df['no_punctuation'] = no_punctuation

In [None]:
# view dataframe with the new no_punctuation column
df.head()

In [None]:
# drop English stop words from each punctuation-free tweet
# NOTE(review): punctuation was stripped from the tweets before this step, so
# stop words that contain an apostrophe (NLTK's list is believed to include
# forms like "don't") will presumably never match tokens such as "dont" - confirm
stop_lookup = set(stop)  # same membership as the list, constant-time lookup


def _without_stopwords(text):
    """Return `text` with every whitespace-separated stop word removed."""
    kept = [token for token in text.split() if token not in stop_lookup]
    return ' '.join(kept)


df['tweets_without_stopwords'] = df['no_punctuation'].apply(_without_stopwords)

In [None]:
# view dataframe with the new tweets_without_stopwords column
df.head()

In [None]:
# count how often each word occurs across all cleaned tweets
tokens_by_position = df['tweets_without_stopwords'].str.split(expand=True)
all_tokens = tokens_by_position.stack()  # one row per word; empty slots are dropped
all_tokens.value_counts()

In [None]:
# graph of top 10 words of positive tweets
with plt.style.context('fivethirtyeight'):
    positive_tokens = df[df['target'] == 1]['tweets_without_stopwords'].str.split(
        expand=True).stack()
    # Exclude the bare token "2" before taking the top 10.
    # errors="ignore" keeps the cell working when "2" is not among the counted
    # words (the previous head(11)/drop("2") raised KeyError in that case).
    top_positive = positive_tokens.value_counts().drop("2", errors="ignore").head(10)
    pd.DataFrame(top_positive).plot(kind="barh", color="c", legend=False)
    plt.gca().invert_yaxis()  # most frequent word at the top
    plt.xlabel("# of tweets")
    plt.title("Most common words in positive tweets")

In [None]:
# graph of top 10 words of negative tweets
with plt.style.context('fivethirtyeight'):
    negative_tokens = df[df['target'] == 0]['tweets_without_stopwords'].str.split(
        expand=True).stack()
    # Exclude the bare token "2" before taking the top 10.
    # errors="ignore" keeps the cell working when "2" is not among the counted
    # words (the previous head(11)/drop("2") raised KeyError in that case).
    top_negative = negative_tokens.value_counts().drop("2", errors="ignore").head(10)
    pd.DataFrame(top_negative).plot(kind="barh", color="c", legend=False)
    plt.gca().invert_yaxis()  # most frequent word at the top
    plt.xlabel("# of tweets")
    plt.title("Most common words in negative tweets")

# 4. Model

#### Train test split

In [None]:
# train test split
# X is the cleaned tweet text, y the 0/1 sentiment label
X = df['tweets_without_stopwords']
y = df['target']

# 80/20 split with a fixed seed so the partition is reproducible
# NOTE(review): the model_helper calls below receive the full X and y, not these
# splits; X_train/X_test/y_test are only used again in the "Interpret" section.
# Presumably model_helper performs its own split - confirm it uses the same one.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)

## Bag of words model iteration
### Use logistic regression, decision tree classifier, and random forest classifier

In [None]:
# logistic regression with default settings, bow=True (bag-of-words section)
# NOTE(review): model_helper lives in helper.py (not shown); presumably it vectorizes, fits and scores the model - confirm
lr_basic = model_helper(X, y, LogisticRegression(), bow=True)

In [None]:
# decision tree with default settings, bow=True (bag-of-words section)
# NOTE(review): no random_state is passed to the tree here, unlike the random forests below - results may vary between runs
dt_basic = model_helper(X, y, DecisionTreeClassifier(), bow=True)

In [None]:
# decision tree limited to max_depth=70, bow=True (bag-of-words section)
dt_70 = model_helper(X, y, DecisionTreeClassifier(max_depth=70), bow=True)

### * Best model score 🎉

In [None]:
# random forest with a fixed seed, bow=True (bag-of-words section)
# `rf` is reused in the "Interpret" section for the confusion matrix and feature importances
rf = model_helper(X, y, RandomForestClassifier(random_state=11), bow=True)

In [None]:
# random forest limited to max_depth=50 with a fixed seed, bow=True (bag-of-words section)
rf_50 = model_helper(X, y, RandomForestClassifier(max_depth=50, random_state=11), bow=True)

## Bag of words with class_weight="balanced" model iteration
### Use logistic regression, decision tree classifier, and random forest classifier

In [None]:
# logistic regression with class_weight="balanced", bow=True
lr_cw = model_helper(X, y, LogisticRegression(class_weight="balanced"), bow=True)

In [None]:
# decision tree with class_weight="balanced", bow=True
dt_cw = model_helper(X, y, DecisionTreeClassifier(class_weight="balanced"), bow=True)

In [None]:
# random forest with class_weight="balanced" and a fixed seed, bow=True
rf_cw = model_helper(X, y, RandomForestClassifier(class_weight="balanced", random_state=11), bow=True)

## Tfidf model iteration
### Use logistic regression, decision tree classifier, and random forest classifier

In [None]:
# logistic regression with default settings, bow=False (TF-IDF section)
# NOTE(review): presumably bow=False makes model_helper use TF-IDF features - confirm in helper.py
lr_tfidf = model_helper(X, y, LogisticRegression(), bow=False)

In [None]:
# decision tree with default settings, bow=False (TF-IDF section)
dt_tfidf = model_helper(X, y, DecisionTreeClassifier(), bow=False)

In [None]:
# random forest with a fixed seed, bow=False (TF-IDF section)
rf_tfidf = model_helper(X, y, RandomForestClassifier(random_state=11), bow=False)

## Tfidf and class_weight="balanced" model iteration
### Use logistic regression, decision tree classifier, and random forest classifier

In [None]:
# logistic regression with class_weight="balanced", bow=False (TF-IDF section)
# stored under its own name so it no longer overwrites `lr_tfidf`, the
# unweighted TF-IDF model fitted earlier in the notebook
lr_tfidf_cw = model_helper(X, y, LogisticRegression(class_weight="balanced"), bow=False)

In [None]:
# decision tree with class_weight="balanced", bow=False (TF-IDF section)
# stored under its own name so it no longer overwrites `dt_tfidf`, the
# unweighted TF-IDF model fitted earlier in the notebook
dt_tfidf_cw = model_helper(X, y, DecisionTreeClassifier(class_weight="balanced"), bow=False)

In [None]:
# random forest with class_weight="balanced" and a fixed seed, bow=False (TF-IDF section)
# stored under its own name so it no longer overwrites `rf_tfidf`, the
# unweighted TF-IDF model fitted earlier in the notebook
rf_tfidf_cw = model_helper(X, y, RandomForestClassifier(class_weight="balanced", random_state=11), bow=False)

# 5. Interpret

In [None]:
# Build bag-of-words features for the confusion matrix below:
# the vocabulary is learned from the training tweets only and then applied to the test tweets
# NOTE(review): `rf` was fitted inside model_helper; these matrices only match it if the
# helper used the same split and an equivalent CountVectorizer - confirm in helper.py
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

In [None]:
# confusion matrix with best model, random forest
# ConfusionMatrixDisplay.from_estimator is the supported replacement for
# plot_confusion_matrix, which was deprecated in scikit-learn 1.0 and removed in 1.2
ConfusionMatrixDisplay.from_estimator(rf, X_test_bow, y_test)

In [None]:
# build a table pairing each vocabulary word with the random forest's importance score
# NOTE(review): this pairs words and scores by position, which assumes the vocabulary
# of `vectorizer` has the same order and length as the features `rf` was trained on
# inside model_helper - confirm
feat_importance = pd.DataFrame({"word": vectorizer.get_feature_names_out(), 
                               "importance": rf.feature_importances_})

In [None]:
# keep the ten words with the highest importance, highest first
ranked_importance = feat_importance.sort_values(by="importance", ascending=False)
fi = ranked_importance.iloc[:10]
fi

In [None]:
# horizontal bar chart of the ten most important words held in `fi`
POSITIVE_COLOR = '#5cb85c'
NEGATIVE_COLOR = '#d9534f'
# one colour per bar, hand-assigned by rank position (not derived from the data);
# the list must be revisited whenever the top-10 ranking changes
bar_colors = [POSITIVE_COLOR, NEGATIVE_COLOR, POSITIVE_COLOR, NEGATIVE_COLOR, NEGATIVE_COLOR,
              NEGATIVE_COLOR, POSITIVE_COLOR, POSITIVE_COLOR, NEGATIVE_COLOR, POSITIVE_COLOR]

with plt.style.context('fivethirtyeight'):
    plt.barh(y=fi["word"], width=fi["importance"], color=bar_colors)
    plt.gca().invert_yaxis()  # most important word at the top

    legend_handles = [
        mpatches.Patch(color=NEGATIVE_COLOR, label='Negative tweets'),
        mpatches.Patch(color=POSITIVE_COLOR, label='Positive tweets'),
    ]

    plt.legend(handles=legend_handles, bbox_to_anchor=(1, 0.5))
    plt.xlabel('Feature Importance')
    plt.title('Most Important Words');

In [None]:
# look at target count by keyword function, here for the token "rt"
# NOTE(review): word_count_by_class is defined in helper.py (not shown); presumably it
# reports how many tweets of each class contain the word - confirm
word_count_by_class("rt", df)

In [None]:
# find tweets by keyword function, here for the word "fail"
# NOTE(review): tweet_finder_by_word is defined in helper.py (not shown); presumably it
# returns or displays the tweets containing the word - confirm
tweet_finder_by_word("fail", df)

# Conclusions and Future Work

### Using machine learning, we can predict whether a tweet is positive or negative. Negative tweets have strongly negative words, while positive tweets have less distinctive words that indicate positivity. For future work, I’d look at the labels of the tweets: some tweets appear to be incorrectly labeled within the dataset.
