# Tweets

## 1 - Import useful modules

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import words
from nltk.metrics.distance import edit_distance
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

# Fetch the NLTK resources used by this notebook; the captured output shows the
# call just reports "already up-to-date" when a package is present.
# 'punkt': tokenizer models -- not referenced by any code visible in this notebook; TODO confirm it is still needed.
nltk.download('punkt')
# 'wordnet': presumably the data behind WordNetLemmatizer (instantiated in a later cell) -- verify.
nltk.download('wordnet')
# 'words': the word list read through `words.words()` when the spelling vocabulary is built.
nltk.download('words')

[nltk_data] Downloading package punkt to /Users/mfarhi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/mfarhi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to /Users/mfarhi/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

## 2 - Load and prepare dataset

In [2]:
def _read_lines(path):
    """Return the content of the text file at `path` as a list of lines (line terminators removed)."""
    # Decode explicitly so the result does not depend on the platform's default encoding.
    # NOTE(review): assumes the data files are UTF-8 encoded -- confirm against the dataset.
    with open(path, encoding='utf-8') as f:
        return f.read().splitlines()


# Each line of a file becomes one entry (one row of the corresponding frame below).
pos_data = _read_lines('data/processedPositive.txt')
neg_data = _read_lines('data/processedNegative.txt')
neu_data = _read_lines('data/processedNeutral.txt')

# Label encoding used throughout the notebook: 1 = positive, -1 = negative, 0 = neutral.
processed_positive_df = pd.DataFrame({'tweets': pos_data, 'labels': 1})
processed_negative_df = pd.DataFrame({'tweets': neg_data, 'labels': -1})
processed_neutral_df = pd.DataFrame({'tweets': neu_data, 'labels': 0})

Concatenate all three categories into one dataframe.

In [3]:
# Stack the positive, negative and neutral frames (in that order) and
# renumber the rows with a fresh 0..n-1 index.
df = pd.concat(
    (processed_positive_df, processed_negative_df, processed_neutral_df),
    ignore_index=True,
)

Count the duplicated rows, then remove them.

In [4]:
# Number of rows that exactly repeat an earlier row (same 'tweets' text and same 'labels' value).
df.duplicated().sum()

300

In [5]:
# Keep only the first occurrence of every (tweet, label) row; the original row index is preserved.
df = df.drop_duplicates()

Split the dataset into 80% training and 20% test with stratification.

In [6]:
# 80/20 split, stratified on the label so both parts keep the class proportions;
# the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['tweets'],
    df['labels'],
    test_size=0.2,
    random_state=1337,
    stratify=df['labels'],
)

Make sure the split was stratified.

In [7]:
# Share of each label in the full (de-duplicated) dataset.
df['labels'].value_counts().div(len(df))

 0    0.373797
-1    0.318653
 1    0.307550
Name: labels, dtype: float64

In [8]:
# Share of each label in the training part.
y_train.value_counts().div(len(y_train))

 0    0.373901
-1    0.318371
 1    0.307728
Name: labels, dtype: float64

In [9]:
# Share of each label in the test part.
y_test.value_counts().div(len(y_test))

 0    0.373383
-1    0.319778
 1    0.306839
Name: labels, dtype: float64

## 3 - Preprocessing

We will try different vectorization techniques: binary, word count, TF-IDF and bigrams (word counts over unigrams and bigrams together). We will also try different preprocessing techniques: simple tokenization, stemming, lemmatization, stemming + misspelling correction and lemmatization + misspelling correction.

In [10]:
# Shared normalizer instances; the custom vectorizer classes below look these
# names up at call time.
porter_stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()

In [11]:
class StemVectorizer(CountVectorizer):
    """CountVectorizer variant that Porter-stems every token produced by the parent tokenizer."""

    def build_tokenizer(self):
        """Return a callable mapping a document to the list of stems of its tokens."""
        split_into_tokens = super().build_tokenizer()

        def stem_tokens(doc):
            return [porter_stemmer.stem(token) for token in split_into_tokens(doc)]

        return stem_tokens

In [12]:
class StemTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer variant that Porter-stems every token produced by the parent tokenizer."""

    def build_tokenizer(self):
        """Return a callable mapping a document to the list of stems of its tokens."""
        split_into_tokens = super().build_tokenizer()

        def stem_tokens(doc):
            return [porter_stemmer.stem(token) for token in split_into_tokens(doc)]

        return stem_tokens

In [13]:
class LemmaVectorizer(CountVectorizer):
    """CountVectorizer variant that lemmatizes every token produced by the parent tokenizer."""

    def build_tokenizer(self):
        """Return a callable mapping a document to the list of lemmas of its tokens."""
        split_into_tokens = super().build_tokenizer()

        def lemmatize_tokens(doc):
            return [lemmatizer.lemmatize(token) for token in split_into_tokens(doc)]

        return lemmatize_tokens

In [14]:
class LemmaTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer variant that lemmatizes every token produced by the parent tokenizer."""

    def build_tokenizer(self):
        """Return a callable mapping a document to the list of lemmas of its tokens."""
        split_into_tokens = super().build_tokenizer()

        def lemmatize_tokens(doc):
            return [lemmatizer.lemmatize(token) for token in split_into_tokens(doc)]

        return lemmatize_tokens

Build the vocabulary we will be using to correct misspellings.

In [15]:
# Lower-cased set of every entry of the NLTK `words` corpus; used as the
# reference dictionary for spelling correction.
vocab = set(map(str.lower, words.words()))

We will use caching (memoization) to speed up the correction of misspelled words.

In [16]:
def memoize(f):
    """Return a caching wrapper around `f`.

    Results are stored per distinct argument combination for the lifetime of the
    wrapper (the cache is unbounded), so `f` runs at most once per combination.
    Arguments must be hashable. The wrapper keeps `f`'s name and docstring and
    accepts keyword arguments as well as positional ones.
    """
    # Imported locally so this cell does not depend on an extra top-level import.
    from functools import lru_cache

    # maxsize=None disables eviction, matching a plain "remember everything" cache.
    return lru_cache(maxsize=None)(f)

@memoize
def correct_word(word):
    """Return the vocabulary entry closest to `word` by edit distance.

    Only entries of `vocab` that start and end with the same characters as `word`
    and whose length differs from it by at most one are considered. When several
    candidates share the smallest distance, the alphabetically first one is chosen,
    so the result does not depend on the iteration order of the `vocab` set.
    Returns `word` unchanged when it is empty or when there is no candidate.
    """
    if not word:
        # Nothing to compare first/last characters against.
        return word
    first, last, size = word[0], word[-1], len(word)
    candidates = (
        w for w in vocab
        if w and w[0] == first and w[-1] == last and abs(len(w) - size) <= 1
    )
    # Key is (distance, candidate): the second component breaks ties deterministically.
    return min(candidates, key=lambda w: (edit_distance(word, w), w), default=word)

def autocorrect(tokens):
    """Return a new list in which unknown alphabetic tokens are replaced by `correct_word(token)`.

    Tokens that are already in `vocab`, or that are not purely alphabetic, are
    passed through unchanged.
    """
    corrected = []
    for token in tokens:
        needs_fix = token.isalpha() and token not in vocab
        corrected.append(correct_word(token) if needs_fix else token)
    return corrected

In [17]:
class StemAutocorrectVectorizer(CountVectorizer):
    """CountVectorizer variant: tokens are Porter-stemmed, then passed through `autocorrect`."""

    def build_tokenizer(self):
        """Return a callable mapping a document to its stemmed and spell-corrected tokens."""
        split_into_tokens = super().build_tokenizer()

        def stem_then_correct(doc):
            stems = [porter_stemmer.stem(token) for token in split_into_tokens(doc)]
            return autocorrect(stems)

        return stem_then_correct

In [18]:
class StemAutocorrectTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer variant: tokens are Porter-stemmed, then passed through `autocorrect`."""

    def build_tokenizer(self):
        """Return a callable mapping a document to its stemmed and spell-corrected tokens."""
        split_into_tokens = super().build_tokenizer()

        def stem_then_correct(doc):
            stems = [porter_stemmer.stem(token) for token in split_into_tokens(doc)]
            return autocorrect(stems)

        return stem_then_correct

In [19]:
class LemmaAutocorrectVectorizer(CountVectorizer):
    """CountVectorizer variant: tokens are lemmatized, then passed through `autocorrect`."""

    def build_tokenizer(self):
        """Return a callable mapping a document to its lemmatized and spell-corrected tokens."""
        split_into_tokens = super().build_tokenizer()

        def lemmatize_then_correct(doc):
            lemmas = [lemmatizer.lemmatize(token) for token in split_into_tokens(doc)]
            return autocorrect(lemmas)

        return lemmatize_then_correct

In [20]:
class LemmaAutocorrectTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer variant: tokens are lemmatized, then passed through `autocorrect`."""

    def build_tokenizer(self):
        """Return a callable mapping a document to its lemmatized and spell-corrected tokens."""
        split_into_tokens = super().build_tokenizer()

        def lemmatize_then_correct(doc):
            lemmas = [lemmatizer.lemmatize(token) for token in split_into_tokens(doc)]
            return autocorrect(lemmas)

        return lemmatize_then_correct

In [21]:
vectorization_techniques = ['binary', 'word_count', 'tfidf', 'bigrams']
preprocessing_techniques = ['just_tokenization', 'stemming', 'lemmatization', 'stemming+misspellings', 'lemmatization+misspellings']

# Vectorizer class to use for each preprocessing technique, for the
# count-based family and for the TF-IDF family.
count_vectorizer_classes = {
    'just_tokenization': CountVectorizer,
    'stemming': StemVectorizer,
    'lemmatization': LemmaVectorizer,
    'stemming+misspellings': StemAutocorrectVectorizer,
    'lemmatization+misspellings': LemmaAutocorrectVectorizer,
}
tfidf_vectorizer_classes = {
    'just_tokenization': TfidfVectorizer,
    'stemming': StemTfidfVectorizer,
    'lemmatization': LemmaTfidfVectorizer,
    'stemming+misspellings': StemAutocorrectTfidfVectorizer,
    'lemmatization+misspellings': LemmaAutocorrectTfidfVectorizer,
}
# For each vectorization technique: (class lookup table, constructor keyword arguments).
# 'bigrams' counts unigrams and bigrams together (ngram_range=(1, 2)).
vectorizer_recipes = {
    'binary': (count_vectorizer_classes, {'binary': True}),
    'word_count': (count_vectorizer_classes, {'binary': False}),
    'tfidf': (tfidf_vectorizer_classes, {}),
    'bigrams': (count_vectorizer_classes, {'ngram_range': (1, 2)}),
}

# Despite its name, `results` stores the vectorizer fitted on X_train for every
# (preprocessing technique, vectorization technique) combination.
results = pd.DataFrame(columns=vectorization_techniques, index=preprocessing_techniques)
for v in vectorization_techniques:
    class_by_preprocessing, init_kwargs = vectorizer_recipes[v]
    for p in preprocessing_techniques:
        vectorizer = class_by_preprocessing[p](**init_kwargs)
        vectorizer.fit(X_train)
        results.loc[p, v] = vectorizer

## 4 - Similarity

We will use the different approaches that we prepared in the task above and cosine similarity to find the top-10 most similar pairs of tweets.

In [22]:
def find_similar_tweets(X_vec, normalized=False, n=10):
    """Return the `n` highest-scoring row pairs of `X_vec` by pairwise similarity.

    Parameters
    ----------
    X_vec : matrix of vectorized documents, one row per document.
    normalized : when True, `linear_kernel` (plain dot products) is used instead of
        `cosine_similarity`. NOTE(review): this presumably relies on the rows already
        being L2-normalized so that both give the same scores -- confirm for the caller.
    n : number of pairs to return.

    Returns
    -------
    DataFrame with columns 'tweet1', 'tweet2' (row positions in `X_vec`, with
    tweet1 < tweet2) and 'similarity', sorted by descending similarity. After
    sorting, only the first pair for each 'tweet1' value and then the first pair
    for each 'tweet2' value is kept; a row position may still appear once in each
    column.
    """
    if normalized:
        scores = linear_kernel(X_vec, X_vec)
    else:
        scores = cosine_similarity(X_vec, X_vec)

    # Long format: one row per (i, j) cell of the similarity matrix.
    pairs = pd.DataFrame(scores).stack().reset_index()
    pairs.columns = ['tweet1', 'tweet2', 'similarity']

    # Strict upper triangle only: drops self-pairs and mirrored duplicates.
    upper = pairs[pairs['tweet1'] < pairs['tweet2']]
    ranked = upper.sort_values(by='similarity', ascending=False)
    ranked = ranked.drop_duplicates(subset='tweet1')
    ranked = ranked.drop_duplicates(subset='tweet2')
    return ranked.head(n)

In [23]:
# For every (vectorization, preprocessing) combination, print the 10 most
# similar pairs of training tweets together with their similarity score.
for v in vectorization_techniques:
    for p in preprocessing_techniques:
        print(f'Top 10 most similar tweets using {p} for preprocessing and {v} for vectorization:')
        vectorizer = results.loc[p, v]
        X_train_vectorized = vectorizer.transform(X_train)
        # TF-IDF is the only technique for which the plain dot product is requested.
        top = find_similar_tweets(X_train_vectorized, normalized=(v == 'tfidf'))
        # tweet1/tweet2 are row positions in X_train_vectorized, hence `.iloc` on X_train.
        for idx_1, idx_2, score in zip(top['tweet1'], top['tweet2'], top['similarity']):
            print(f'Tweet {idx_1}:', X_train.iloc[idx_1])
            print(f'Tweet {idx_2}:', X_train.iloc[idx_2])
            print(f"Similarity: {score:.2f}")
        print('-' * 80)

Top 10 most similar tweets using just_tokenization for preprocessing and binary for vectorization:
Tweet 77: thank you! happy
Tweet 546: thank you happy
Similarity: 1.00
Tweet 1917: Thanks for being top engaged community members this week happy  Want this
Tweet 2006: Thanks for being top engaged community members this week happy  Want this ?
Similarity: 1.00
Tweet 1136: When will you notice me? unhappy
Tweet 1934: when will you notice me? unhappy
Similarity: 1.00
Tweet 546: thank you happy
Tweet 1043: Thank you! happy
Similarity: 1.00
Tweet 48: Thank you happy
Tweet 77: thank you! happy
Similarity: 1.00
Tweet 1469: miss you unhappy
Tweet 1728: I miss you unhappy
Similarity: 1.00
Tweet 1141: thanks for the recent follow. Much appreciated happy   Want this ?
Tweet 1744: thanks for the recent follow, much appreciated happy   Want this ?
Similarity: 1.00
Tweet 953: Hello everyone, have a great Thursday! Looking forward to reading your tweets happy  Want this
Tweet 974: Hello everyone, have

## 5 - Machine learning

We will try different algorithms and the different approaches that we prepared before to solve the classification task – sentiment analysis.

In [24]:
# import the models we will be using
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import accuracy_score, classification_report

# Candidate models, all with default hyper-parameters; only the random forest
# takes a seed here.
classifiers = [
    KNeighborsClassifier(),
    LogisticRegression(),
    RandomForestClassifier(random_state=1337),
    SVC(),
    ComplementNB(),
]

In [25]:
# Index the classifier instances by their class name, keeping the list order.
classifiers_dict = dict((type(model).__name__, model) for model in classifiers)
classifiers_dict

{'KNeighborsClassifier': KNeighborsClassifier(),
 'LogisticRegression': LogisticRegression(),
 'RandomForestClassifier': RandomForestClassifier(random_state=1337),
 'SVC': SVC(),
 'ComplementNB': ComplementNB()}

In [26]:
# Empty grid (rows: preprocessing techniques, columns: vectorization techniques)
# that the evaluation loop fills with test accuracies.
accuracy = pd.DataFrame(index=preprocessing_techniques, columns=vectorization_techniques)

In [27]:
# Transform the train and test sets once per fitted vectorizer. The transformed
# matrices do not depend on the classifier, so computing them inside the
# classifier loop would repeat the same tokenization/stemming work for every model.
vectorized_splits = {
    (p, v): (results.loc[p, v].transform(X_train), results.loc[p, v].transform(X_test))
    for v in vectorization_techniques
    for p in preprocessing_techniques
}

# For each classifier, fit on every (preprocessing, vectorization) combination
# and record the test accuracy; `accuracy` is overwritten per classifier and
# printed before moving on to the next one.
for classifier, clf in classifiers_dict.items():
    print(classifier)
    for v in vectorization_techniques:
        for p in preprocessing_techniques:
            X_train_vectorized, X_test_vectorized = vectorized_splits[p, v]
            clf.fit(X_train_vectorized, y_train)
            y_pred = clf.predict(X_test_vectorized)
            accuracy.loc[p, v] = accuracy_score(y_test, y_pred)
    print(accuracy)
    print()

KNeighborsClassifier
                              binary word_count     tfidf   bigrams
just_tokenization           0.609982   0.621072  0.759704  0.534196
stemming                    0.600739   0.617375  0.759704  0.530499
lemmatization               0.608133   0.622921   0.76525  0.537893
stemming+misspellings       0.604436   0.617375  0.748614  0.528651
lemmatization+misspellings  0.609982   0.619224  0.759704  0.537893

LogisticRegression
                              binary word_count     tfidf   bigrams
just_tokenization           0.922366   0.913124  0.913124  0.911275
stemming                    0.916821   0.911275  0.909427  0.909427
lemmatization               0.911275   0.902033  0.913124   0.90573
stemming+misspellings       0.918669   0.907579  0.911275  0.909427
lemmatization+misspellings  0.909427   0.900185  0.911275   0.90573

RandomForestClassifier
                              binary word_count     tfidf   bigrams
just_tokenization           0.911275   0.924214  0.

Show additional metrics for the best performing algorithm and approach.

In [28]:
# Re-fit ComplementNB on stemmed word counts and print per-class
# precision / recall / F1 on the test set.
vectorizer = results.loc['stemming', 'word_count']
X_train_vectorized, X_test_vectorized = (
    vectorizer.transform(split) for split in (X_train, X_test)
)

best_classifier = classifiers_dict['ComplementNB']
# `fit` returns the estimator itself, so the prediction can be chained.
y_pred = best_classifier.fit(X_train_vectorized, y_train).predict(X_test_vectorized)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.89      0.95      0.92       173
           0       0.97      0.96      0.96       202
           1       0.95      0.89      0.92       166

    accuracy                           0.94       541
   macro avg       0.94      0.94      0.94       541
weighted avg       0.94      0.94      0.94       541

