# Tweets

## 1 - Import useful modules

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

# Fetch the Punkt tokenizer data. Presumably required by word_tokenize, which in this
# notebook is only referenced from a commented-out cell — verify it is still needed.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/mfarhi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## 2 - Load and prepare dataset

In [2]:
def _read_lines(path):
    """Return the lines of the text file at `path`, without line terminators.

    The file is decoded as UTF-8 explicitly: without an `encoding` argument,
    `open()` falls back to the platform locale, so the same tweet file could be
    decoded differently (or fail to decode) on another machine.
    """
    with open(path, encoding='utf-8') as f:
        return f.read().splitlines()


# One tweet per line in each file.
pos_data = _read_lines('data/processedPositive.txt')
neg_data = _read_lines('data/processedNegative.txt')
neu_data = _read_lines('data/processedNeutral.txt')

# Label encoding used throughout the notebook: 1 = positive, -1 = negative, 0 = neutral.
processed_positive_df = pd.DataFrame({'tweets': pos_data, 'labels': 1})
processed_negative_df = pd.DataFrame({'tweets': neg_data, 'labels': -1})
processed_neutral_df = pd.DataFrame({'tweets': neu_data, 'labels': 0})

Concatenate all three categories into one dataframe.

In [3]:
# Stack the three per-sentiment frames and renumber the rows from 0.
sentiment_frames = [
    processed_positive_df,
    processed_negative_df,
    processed_neutral_df,
]
df = pd.concat(sentiment_frames, ignore_index=True)

Split the dataset into 80% training and 20% test with stratification.

In [4]:
# Hold out 20% of the tweets for testing. The split is stratified on the label column
# and uses a fixed random_state so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['tweets'],
    df['labels'],
    test_size=0.2,
    stratify=df['labels'],
    random_state=1337,
)

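Make sure the split was stratified: the class proportions in the full dataset, the training set, and the test set (computed in the next three cells) should be nearly identical.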

In [5]:
# Fraction of rows carrying each label in the full dataset.
df['labels'].value_counts().div(len(df))

 0    0.338108
-1    0.331446
 1    0.330446
Name: labels, dtype: float64

In [6]:
# Fraction of rows carrying each label in the training split.
y_train.value_counts().div(len(y_train))

 0    0.338192
-1    0.331529
 1    0.330279
Name: labels, dtype: float64

In [7]:
# Fraction of rows carrying each label in the test split.
y_test.value_counts().div(len(y_test))

 0    0.337770
 1    0.331115
-1    0.331115
Name: labels, dtype: float64

## 3 - Preprocessing

In [8]:
# Disabled: explicit NLTK tokenization. The vectorizers in the following cells are
# fitted directly on the raw tweet strings, so X_train must stay a Series of str.
# TODO: delete this cell (and the unused nltk imports) if tokenization is not coming back.
#X_train = X_train.apply(word_tokenize)

In [9]:
# Bag-of-words with binary=True: each feature records whether a term occurs in a
# tweet (0/1), not how many times it occurs.
binary_vectorizer = CountVectorizer(binary=True)
# Learn the vocabulary from the training tweets only and encode them.
X_train_vec = binary_vectorizer.fit_transform(X_train)

In [10]:
# Show the sparse-matrix summary (shape, dtype, number of stored entries) of the
# matrix produced by the binary vectorizer in the previous cell.
X_train_vec

<2401x5533 sparse matrix of type '<class 'numpy.int64'>'
	with 24838 stored elements in Compressed Sparse Row format>

In [11]:
# Peek at the last ten entries of the learned vocabulary.
binary_vocab = binary_vectorizer.get_feature_names_out()
binary_vocab[-10:]

array(['yoyour', 'yoyoyou', 'yr', 'yummy', 'yura', 'zabardast', 'zac',
       'zcc', 'zoo', 'zoos'], dtype=object)

In [12]:
# Encode a hand-written probe sentence with repeated tokens ('00', 'zoo') to see how
# the binary vectorizer treats repetitions.
binary_probe = ['00 00 zoo 000 zcc zoo zoos I love this movie']
binary_vectorizer.transform(binary_probe).toarray()

array([[1, 1, 0, ..., 1, 1, 1]])

In [13]:
# Bag-of-words with raw term counts (CountVectorizer with default settings).
word_count_vectorizer = CountVectorizer()
# NOTE: this rebinds X_train_vec, replacing the binary matrix built in an earlier cell.
X_train_vec = word_count_vectorizer.fit_transform(X_train)
# Last ten vocabulary entries, for comparison with the binary vectorizer's vocabulary.
word_count_vectorizer.get_feature_names_out()[-10:]

array(['yoyour', 'yoyoyou', 'yr', 'yummy', 'yura', 'zabardast', 'zac',
       'zcc', 'zoo', 'zoos'], dtype=object)

In [14]:
# Encode the probe sentence with the count vectorizer; repeated tokens ('00', 'zoo')
# should now show up with their occurrence counts.
count_probe = ['00 00 zoo 000 zcc zoo zoos I love this movie']
word_count_vectorizer.transform(count_probe).toarray()

array([[2, 1, 0, ..., 1, 2, 1]])

In [15]:
# TF-IDF weighting with default settings.
tfidf_vectorizer = TfidfVectorizer()
# NOTE: this rebinds X_train_vec once more. When the notebook is run top to bottom,
# this TF-IDF matrix is the one the similarity cell in section 4 reads.
X_train_vec = tfidf_vectorizer.fit_transform(X_train)
# Last ten vocabulary entries, for comparison with the count-based vectorizers.
tfidf_vectorizer.get_feature_names_out()[-10:]

array(['yoyour', 'yoyoyou', 'yr', 'yummy', 'yura', 'zabardast', 'zac',
       'zcc', 'zoo', 'zoos'], dtype=object)

In [16]:
# Encode the probe sentence with the TF-IDF vectorizer to inspect the resulting weights.
tfidf_probe = ['00 00 zoo 000 zcc zoo zoos I love this movie']
tfidf_vectorizer.transform(tfidf_probe).toarray()

array([[0.55799809, 0.27899905, 0.        , ..., 0.29371835, 0.55799809,
        0.29371835]])

## 4 - Similarity

In [17]:
# Pairwise dot products between all training tweets; with the second argument omitted,
# linear_kernel compares X_train_vec against itself.
# NOTE(review): X_train_vec is rebound by several earlier cells. This presumably expects
# the TF-IDF matrix, so the result depends on the cells having been run in order.
similarity = linear_kernel(X_train_vec)
# TODO: find the top 10 most similar pairs of tweets

"similarity_df = pd.DataFrame(similarity)\nsimilarity_df = similarity_df.stack().reset_index()\nsimilarity_df.columns = ['tweet1', 'tweet2', 'similarity']\nsimilarity_df = similarity_df[similarity_df['tweet1'] != similarity_df['tweet2']]\nsimilarity_df = similarity_df.sort_values(by='similarity', ascending=False)\nsimilarity_df.head(10)"

## 5 - Machine learning