In [1]:
import pandas as pd
import seaborn as sns
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report



In [2]:
# Load the labelled tweets into a DataFrame; the path is relative to the notebook's working directory.
TRAIN_CSV = "./data/train_set.csv"
data = pd.read_csv(TRAIN_CSV)



# Data analysis

- 57% of the samples belong to the negative class - should the dataset be treated as unbalanced or balanced? Why?
- length of the tweet is not a relevant feature
- 69 tweets have at least one duplicate, leading to 110 duplicated tweets - delete manually or automatically?
- not all keywords and locations are available (61 and 2533 values are missing respectively)

Features to investigate:

    1. Tweets length
    2. Average word count
    3. Average word length 
    

In [None]:
# Preview the first five rows to see which columns are available.
data.head(n=5)

In [None]:
# Number of rows per value of the `target` column (class balance check).
data["target"].value_counts()

In [None]:
# Count of missing values in each column.
data.isna().sum()

In [None]:
# Summary statistics of tweet length in characters.
# Some lengths exceed 140; the original analysis attributes this to emojis (not verified here).
tweet_lengths = data["text"].str.len()
tweet_lengths.describe()


In [None]:
# Distribution of tweet length (in characters) over the whole dataset.
tweet_lengths = data["text"].str.len()
sns.distplot(tweet_lengths)

In [None]:
# Distribution of tweet length (in characters) for rows with target == 1.
# The mask must be built from `data`: no variable named `train` exists in this notebook,
# so the previous `train.target` raised NameError on a fresh kernel.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11; consider sns.histplot(..., kde=True).
sns.distplot(data[data.target == 1].text.str.len())

In [None]:
# Distribution of tweet length (in characters) for rows with target == 0.
# The mask must be built from `data`: no variable named `train` exists in this notebook,
# so the previous `train.target` raised NameError on a fresh kernel.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11; consider sns.histplot(..., kde=True).
sns.distplot(data[data.target == 0].text.str.len())


In [None]:
# Show every row whose tweet text occurs more than once (keep=False flags all copies,
# not just the repeats). The frame being indexed is `data`; `train` was never defined,
# so the previous expression raised NameError.
data[data.duplicated(["text"], keep=False)]

In [3]:
# Keep only the first occurrence of each distinct tweet text.
data = data.drop_duplicates(subset=["text"])


# Perform data split

- Why split?
- Why 0.3?
- Why shuffle=False?


In [4]:
# Hold out 30% of the rows for evaluation. With shuffle=False the rows keep their
# original order, so the split is identical on every run.
train_split, test_split = train_test_split(
    data,
    test_size=0.3,
    shuffle=False,
)

# Features extraction

We will use the following approaches:

    1. Bag-of-words (CountVectorizer)

    2. TF-IDF (Term frequency-Inverse document frequency)

    3. doc2vec

    4. Custom Linguistic Features (spaCy)

    

There are 3 methods to perform feature selection:

    1. Univariate Selection
        - Statistical tests are used to select features that have the strongest relationship
    
    2. Feature Importance
        - Assigns a score for each feature of the data, the higher the score the more important/relevant the feature
    
    3. Correlation Matrix with Heatmap
        - States how the features are related to each other or to the target variable

In [10]:
# Bag-of-words features (CountVectorizer): token counts per tweet.

from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer()

# Fit the vocabulary on the training tweets only, then apply the same
# vocabulary to the held-out tweets.
train_text, test_text = train_split["text"], test_split["text"]
train_vectors = count_vectorizer.fit_transform(train_text)
test_vectors = count_vectorizer.transform(test_text)

# The learned token -> column mapping is available as count_vectorizer.vocabulary_


In [None]:
# TF-IDF features (TfidfVectorizer).
# NOTE: this cell rebinds train_vectors / test_vectors, replacing the bag-of-words
# matrices from the CountVectorizer cell; later cells use whichever ran last.

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()

# Fit on the training tweets only, then transform the held-out tweets with the fitted vectorizer.
train_text, test_text = train_split["text"], test_split["text"]
train_vectors = tfidf_vectorizer.fit_transform(train_text)
test_vectors = tfidf_vectorizer.transform(test_text)


In [None]:
# Logistic-regression classifier with the liblinear solver selected explicitly.
clf = linear_model.LogisticRegression(solver="liblinear")

In [None]:
# 5-fold cross-validated F1 score, computed on the training split only.
scores = cross_val_score(
    clf,
    train_vectors,
    train_split["target"],
    scoring="f1",
    cv=5,
)
scores

In [None]:
# Fit on the whole training split, then predict labels for the held-out split.
clf.fit(train_vectors, train_split["target"])
pred = clf.predict(test_vectors)


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix of held-out labels vs. predictions:
# rows are the actual labels, columns the predicted labels.
cm = confusion_matrix(test_split['target'], pred)

fig, ax = plt.subplots(figsize=(12, 12))
ax.imshow(cm)
ax.grid(False)

label_style = dict(fontsize=12, color='black')
ax.set_xlabel('Predicted outputs', **label_style)
ax.set_ylabel('Actual outputs', **label_style)

# One tick per class on each axis (the code assumes a 2 x 2 matrix).
for axis in (ax.xaxis, ax.yaxis):
    axis.set(ticks=range(2))

# Write the count into the centre of every cell.
for row in range(2):
    for col in range(2):
        ax.text(col, row, cm[row, col], ha='center', va='center', color='white')

plt.show()
