In [None]:
!cat Tweets.csv

In [None]:
import pandas as pd
import sklearn
from matplotlib import pyplot as plt

In [None]:
# Complete
# Load the raw tweets from disk.
tweets = pd.read_csv('Tweets.csv')
# Keep only the rows that have both a tweet text and a sentiment label.
# The explicit copy makes `cl_tw` an independent DataFrame: later cells add new
# columns to it, which can otherwise trigger pandas' SettingWithCopyWarning.
cl_tw = tweets.dropna(subset=['text', 'airline_sentiment']).copy()
cl_tw.head()

In [None]:
# Bar chart of the number of tweets per sentiment label.
sentiment_counts = cl_tw['airline_sentiment'].value_counts()
sentiment_counts.plot.bar()

In [None]:
# One histogram series per airline, each holding the sentiment labels of that airline's tweets.
airlines = list(cl_tw['airline'].unique())
sentiments = []

for airline in airlines:
  is_current_airline = cl_tw['airline'] == airline
  sentiments.append(cl_tw.loc[is_current_airline, 'airline_sentiment'].tolist())

plt.hist(sentiments, label=airlines)
plt.legend()
plt.show()

In [None]:
# Number of characters in each tweet, stored as a new column and shown as a histogram.
tweet_lengths = cl_tw['text'].str.len()
cl_tw['tweet_length'] = tweet_lengths
plt.hist(tweet_lengths)

In [None]:
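# NOTE: Superseded draft of the TASK 2 pre-processing; the working version is in the cells below.
# It is kept commented out because the 'text_lowered' column it reads is only created in a later cell.
# import nltk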
# nltk.download('stopwords')
# from nltk.corpus import stopwords
# stop_words = stopwords.words('english')

# cl_tw['text_without_stopwords'] = cl_tw['text_lowered'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

# nltk.download('wordnet')
# from nltk.stem import WordNetLemmatizer
# lemmatizer = WordNetLemmatizer()

# cl_tw['text_lemmatized'] = cl_tw['text_without_stopwords'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))
# cl_tw.head()

**TASK 2:** Pre-process the input texts, i.e., tweets, for classification. You can use external libraries like `nltk` for this task.

1. Bring the texts to lower case.
2. Remove stop words from the lower-cased text.
3. Perform lemmatization on the lower-cased text without stop words.

In [None]:
# TASK 2: Pre-process the input texts, i.e., tweets, for classification.
#
# Step 1 of 3: bring the texts to lower case.
# (Stop-word removal and lemmatization are done in the following cells.)

# Complete

lowered_text = cl_tw['text'].str.lower()
cl_tw['text_lowered'] = lowered_text

In [None]:
def remove_stopwords(text, stopwords):
    """Return `text` with every stop word removed.

    The text is split on whitespace, tokens found in `stopwords` are dropped,
    and the remaining tokens are re-joined with single spaces. Matching is
    exact, so it is case-sensitive and does not strip punctuation.

    Args:
        text: The string to filter.
        stopwords: Container of words to drop (e.g. a list or a set).

    Returns:
        The remaining tokens joined by single spaces ('' if nothing is left).
    """
    # A list/tuple would be scanned linearly for every token; hash it once per
    # call instead. Other containers are used as given to keep their semantics.
    if isinstance(stopwords, (list, tuple)):
        stopwords = frozenset(stopwords)
    return ' '.join(word for word in text.split() if word not in stopwords)


# Complete

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# Drop the English stop words from every lower-cased tweet.
cl_tw['text_lowered_without_stopwords'] = cl_tw['text_lowered'].apply(remove_stopwords, args=(stop_words,))

In [None]:
def lemmatize(text, lemmatizer):
    """Lemmatize every whitespace-separated token of `text`.

    Args:
        text: The string to process.
        lemmatizer: Object exposing a `lemmatize(word)` method that returns
            the lemma of a single word.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    tokens = text.split()
    return ' '.join(map(lemmatizer.lemmatize, tokens))


# Complete
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
# Lemmatize the stop-word-free text, token by token.
cl_tw['text_lowered_without_stopwords_lemmatized'] = cl_tw['text_lowered_without_stopwords'].apply(lemmatize, args=(lemmatizer,))

In [None]:
# Inspect the first rows, including the pre-processing columns added above.
cl_tw.head(n=5)

**TASK 3:** 
1. Obtain TF-IDF features for the pre-processed input texts. You are encouraged to use the `scikit-learn` for this.
2. Split the data into train and test.

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer  # hint

# Seed NumPy's global random number generator to get reproducible results.
np.random.seed(seed=42)

def tf_idf(texts):
    """Fit a TF-IDF model on `texts` and return the features with the fitted model.

    Args:
        texts: The documents (strings) to vectorize.

    Returns:
        A `(features, vectorizer)` tuple: the TF-IDF features of `texts`
        converted with `.toarray()`, and the fitted `TfidfVectorizer`.
    """
    # Complete
    fitted_vectorizer = TfidfVectorizer()
    feature_matrix = fitted_vectorizer.fit_transform(texts)
    dense_features = feature_matrix.toarray()
    return dense_features, fitted_vectorizer

# TF-IDF features of the fully pre-processed tweets and the sentiment labels to predict.
X, vectorizer = tf_idf(cl_tw['text_lowered_without_stopwords_lemmatized'])
y = cl_tw['airline_sentiment']

from sklearn.model_selection import train_test_split

# Complete
# Hold out 20% of the tweets for testing. The split is seeded explicitly so that
# it does not depend on how much of the global NumPy random state was consumed
# before this line, i.e. re-running the cell always yields the same split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

**TASK 4:**
1. Train a 2-layer perceptron with one hidden layer of the size 30 on your training data. You are encouraged to use the `scikit-learn` library for this task. You can use the default hyperparameters set [here](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html).

2. Test your trained MLP on the test data and report accuracy, f1_score and confusion matrix of the predictions.

Food for thought: Which hyperparameter values can improve your model? Hint: Look at the guidelines at the end of this notebook.

In [None]:
from sklearn.neural_network import MLPClassifier as MLP # hint
# Complete, one hidden layer of size 30
# `(30,)` is a real one-element tuple; `(30)` is just the integer 30.
# `random_state` fixes the weight initialisation and shuffling, so the scores
# below do not depend on the order in which cells were executed.
clf = MLP(hidden_layer_sizes=(30,), max_iter=1000, random_state=42)
clf.fit(X_train, y_train)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Complete
# Predict the held-out tweets and report the headline scores.
y_pred = clf.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, y_pred))
# Precision, recall and F1 are macro-averaged over the sentiment classes.
for metric_label, metric in (("Precision: ", precision_score), ("Recall: ", recall_score), ("F1: ", f1_score)):
    print(metric_label, metric(y_test, y_pred, average='macro'))

from sklearn.metrics import confusion_matrix

# Complete
# Print explicitly: a notebook only displays the value of a cell's last
# expression, and more statements follow in this cell, so a bare call here
# would be computed and silently discarded.
print(confusion_matrix(y_test, y_pred))

from sklearn.metrics import classification_report

# Complete
# Per-class summary of the test-set predictions.
report = classification_report(y_test, y_pred)
print(report)

**TASK 5:** From task 1, you might have noticed that most of the tweets start with "@{airline_username}", e.g., "@VirginAmerica". In this task, you will test if your trained model has been biased with respect to the airline name or not.

1. What fraction of the tweets start with "@{airline_username}"?
2. Remove "@{airline_username}" from all the texts that start with this pattern.
3. Re-apply the pre-processings from TASK 2 and re-fit your TF-IDF feature on the new texts.
4. Re-train your MLP using the TF-IDF features from step 3. Make sure to use the same train/test split.
5. Test your new MLP and report accuracy, f1_score, confusion matrix. Are the results different from task 4? How do you interpret your observations?

In [None]:
def remove_airline_usernames(text, airline_usernames):
    """Return `text` without any token contained in `airline_usernames`.

    The text is split on whitespace and the kept tokens are re-joined with
    single spaces. Tokens are compared exactly (case-sensitive, punctuation
    included), and matches are removed wherever they occur in the text.

    Args:
        text: The string to filter.
        airline_usernames: Container of tokens to drop.

    Returns:
        The remaining tokens joined by single spaces.
    """
    kept_words = []
    for word in text.split():
        if word in airline_usernames:
            continue
        kept_words.append(word)
    return ' '.join(kept_words)


# Complete
# First whitespace-separated token of every lower-cased tweet (NaN for empty texts).
first_tokens = cl_tw['text_lowered'].str.split().str[0]
leading_handles = pd.DataFrame({'airline': cl_tw['airline'], 'handle': first_tokens})
leading_handles = leading_handles[leading_handles['handle'].str.startswith('@', na=False)]

# Derive the usernames from the data instead of leaving the set empty (which made
# the removal below a no-op): for each airline, take the "@handle" its tweets most
# often start with.
# NOTE(review): assumes that handle is the airline's own username - check the printed set.
airline_usernames = set(
    leading_handles.groupby('airline')['handle'].agg(lambda handles: handles.mode().iat[0])
)
print("Airline usernames: ", sorted(airline_usernames))

# 1. Fraction of the tweets that start with one of the airline usernames.
print("Fraction of tweets starting with an airline username: ", first_tokens.isin(airline_usernames).mean())

# 2. Remove the airline usernames from the lower-cased texts ...
text_without_usernames = cl_tw['text_lowered'].apply(lambda x: remove_airline_usernames(x, airline_usernames))
# 3. ... and re-apply the remaining TASK 2 pre-processing (stop words, lemmatization).
cl_tw['text_without_airline_username'] = text_without_usernames.apply(
    lambda x: lemmatize(remove_stopwords(x, stop_words), lemmatizer)
)

In [None]:
#Complete
# Re-fit TF-IDF on the texts without airline usernames; the labels are unchanged.
texts_without_usernames = cl_tw['text_without_airline_username']
X, vectorizer = tf_idf(texts_without_usernames)
y = cl_tw['airline_sentiment']

In [None]:
#Complete
# TASK 5 requires the same train/test split as TASK 4. An unseeded call would draw
# from the global NumPy random state, which has already advanced by this point, and
# produce a different split. On a fresh top-to-bottom run the TASK 4 split is drawn
# directly after np.random.seed(42), so seeding with 42 here selects the same rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Complete
# One hidden layer of size 30; `(30,)` is a real one-element tuple, `(30)` is just
# the integer 30. `random_state` fixes the weight initialisation and shuffling, so
# the scores below do not depend on the order in which cells were executed.
clf = MLP(hidden_layer_sizes=(30,), max_iter=1000, random_state=42)
clf.fit(X_train, y_train)

#Complete
# Predict the held-out tweets with the re-trained model and report the headline scores.
y_pred = clf.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, y_pred))
# Precision, recall and F1 are macro-averaged over the sentiment classes.
for metric_label, metric in (("Precision: ", precision_score), ("Recall: ", recall_score), ("F1: ", f1_score)):
    print(metric_label, metric(y_test, y_pred, average='macro'))

#Complete
# Print explicitly: a notebook only displays the value of a cell's last
# expression, and more statements follow in this cell, so a bare call here
# would be computed and silently discarded.
print(confusion_matrix(y_test, y_pred))

#Complete
# Per-class summary of the re-trained model's test-set predictions.
report = classification_report(y_test, y_pred)
print(report)

**TASK 6**: In this task, you will analyze how your MLP features/neurons are behaving with respect to class prediction. To this end, you will examine which neurons in your MLP model have the largest weights when predicting classes.

1. Using the `coefs_` attribute from your trained MLP model, plot a bar chart that shows the score of each of the 30 neurons in your model for each class. 

Hint: Plot the scores only for the output layer. You don't need to plot the scores for the hidden layer.

In [None]:
# **TASK 6**: Analyze how the MLP's hidden neurons behave with respect to class
# prediction by plotting, for every hidden neuron, its output-layer weight for each class.
#
# Hint: Only the output layer is plotted, not the hidden layer.

# Complete

import matplotlib.pyplot as plt
import numpy as np

# `clf` is the most recently trained MLP. The last entry of `coefs_` holds the
# weights between the hidden layer and the output layer: one row per hidden
# neuron, one column per output unit.
output_weights = clf.coefs_[-1]
n_neurons, n_outputs = output_weights.shape

# Reuse the class names as legend labels only when there is one output unit per
# class; otherwise (e.g. a single output unit for a binary problem) fall back to
# generic labels.
if n_outputs == len(clf.classes_):
    output_labels = [str(class_name) for class_name in clf.classes_]
else:
    output_labels = ["output {}".format(position) for position in range(n_outputs)]

fig, ax = plt.subplots(figsize=(20, 10))
index = np.arange(n_neurons)
# Share 80% of each neuron's slot between the bars so neighbouring groups do not overlap.
bar_width = 0.8 / n_outputs
opacity = 0.8

# One group of bars per hidden neuron, one bar per class within each group.
for position, output_label in enumerate(output_labels):
    ax.bar(index + position * bar_width, output_weights[:, position], bar_width,
           alpha=opacity, label=output_label)

# Centre the tick of every neuron under its group of bars.
ax.set_xticks(index + bar_width * (n_outputs - 1) / 2)
ax.set_xticklabels(index)
ax.set_xlabel("Hidden neuron")
ax.set_ylabel("Output-layer weight")
ax.set_title("Output-layer weights of the MLP per hidden neuron and class")
ax.legend()
plt.show()

