In [None]:
import json
from pprint import pprint
import pandas as pd

from nltk.tokenize import TweetTokenizer

import nltk
from nltk.corpus import stopwords

import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

from pathlib import Path

# Constructing the basic dataset

In [None]:
# Location of the scraped tweets, expressed relative to the notebook's directory.
data_folder = Path('../data/twint')
tweets_file = data_folder.joinpath('2019_april_may_#uber_EN.txt')

In [None]:
# Read the scraped tweets; the file is parsed as one JSON object per line.
# The context manager closes the handle even if reading raises, and the
# explicit UTF-8 encoding keeps decoding independent of the platform default.
with open(tweets_file, 'r', encoding='utf-8') as f:
    lines = list(f)

# Skip blank lines (e.g. a trailing empty line), which json.loads would reject.
tweets = [json.loads(l) for l in lines if l.strip()]

# Distinct authors; later cells group the tweets by these usernames.
unique_usernames = {t['username'] for t in tweets}

In [None]:
# Report how many tweets were loaded.
n_tweets = len(tweets)
print('Number of tweets: %s' % n_tweets)

## Top 10 Hashtags

In [None]:
# Flatten the per-tweet hashtag lists into one list, keeping duplicates so
# that occurrences can be counted afterwards.
hashtags = []
for tweet in tweets:
    hashtags.extend(tweet['hashtags'])

In [None]:
from collections import Counter

# Count every hashtag, turn the counts into a one-column table indexed by
# hashtag, and keep the ten most frequent rows.
hashtag_frequencies = Counter(hashtags)
hashtags_count = pd.DataFrame.from_dict(hashtag_frequencies, orient='index', columns=['count'])
hashtags_count = hashtags_count.sort_values(by='count', ascending=False).head(10)
hashtags_count

# Text preparation

## Concatenating all the tweets (= document) for each user

In [None]:
# Group the tweet texts by author.
users_to_tweets_list = {u: [] for u in unique_usernames}

for t in tweets:
    # Newlines become spaces (not ''), so words on adjacent lines of a tweet
    # are not fused into a single token.
    users_to_tweets_list[t['username']].append(t['tweet'].replace('\n', ' '))

# One document per user. Joining with a space keeps the last word of one tweet
# separate from the first word of the next; an empty separator would glue them
# together and corrupt the tokens produced later.
users_to_documents = {
    u: ' '.join(tweets_list)
    for u, tweets_list in users_to_tweets_list.items()
}

## Tokenizing

In [None]:
# NLTK's TweetTokenizer with default arguments; used below to split each
# user's document into tokens.
tokenizer = TweetTokenizer()

In [None]:
# Replace every user's document string with its list of tokens. A fresh dict
# is built and then bound to the same name.
tokenized_documents = {}
for user, document in users_to_documents.items():
    tokenized_documents[user] = tokenizer.tokenize(document)
users_to_documents = tokenized_documents

In [None]:
# Peek at the first (user, tokens) pair without materializing every item;
# evaluates to None instead of raising when there are no users.
next(iter(users_to_documents.items()), None)

## Remove non-alpha characters and lowercase

In [None]:
# First (user, tokens) pair before the alphabetic filter is applied; read
# lazily so the full item list is not built, and None when there are no users.
next(iter(users_to_documents.items()), None)

In [None]:
# Keep only purely alphabetic tokens and lowercase them. The dict is updated
# in place, one user at a time.
for user in list(users_to_documents):
    alphabetic = filter(str.isalpha, users_to_documents[user])
    users_to_documents[user] = [word.lower() for word in alphabetic]

In [None]:
# First (user, tokens) pair after the alphabetic filter and lowercasing; read
# lazily so the full item list is not built, and None when there are no users.
next(iter(users_to_documents.items()), None)

##  Filter out stopwords

In [None]:
stop_words = stopwords.words('english')
# A set gives constant-time membership tests; probing the list would rescan
# every stop word for each token of each user.
stop_word_set = set(stop_words)

for u, d in users_to_documents.items():
    # Lowercase before the lookup so the match does not depend on an earlier
    # cell having lowercased the tokens (the stop-word list is assumed to be
    # lowercase -- TODO confirm).
    filtered = [token for token in map(str.lower, d) if token not in stop_word_set]
    users_to_documents[u] = filtered

## Stemming

In [None]:
from nltk.stem import PorterStemmer

In [None]:
# The stop-word list is re-fetched here as in the original cell; the stemming
# below does not reference it.
stop_words = stopwords.words('english')
stemmer = PorterStemmer()

# Replace every token with its Porter stem, updating the dict in place.
for user in list(users_to_documents):
    users_to_documents[user] = list(map(stemmer.stem, users_to_documents[user]))

In [None]:
# Number of users (one document each) after the text preparation steps.
len(users_to_documents)

## See document length distribution

In [None]:
# Token count of every user's document.
lengths = {}
for user, tokens in users_to_documents.items():
    lengths[user] = len(tokens)

# The 30 longest documents, as a one-column table indexed by username.
lengths_df = pd.DataFrame.from_dict(lengths, orient='index', columns=['length'])
lengths_df = lengths_df.sort_values(by='length', ascending=False).head(30)
lengths_df

## Filter out any users that now have 100 words or fewer

In [None]:
# Keep only the users whose document has more than 100 tokens.
users_to_documents_filtered = {}
for user, tokens in users_to_documents.items():
    if len(tokens) > 100:
        users_to_documents_filtered[user] = tokens

# `c` is bound to the same dict, as in the original cell.
c = users_to_documents_filtered
len(users_to_documents_filtered)

# Constructing the Term/Document Matrix

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# View over the per-user token lists of the retained users (no copy is made).
docs = users_to_documents_filtered.values()

In [None]:
def do_nothing(tokens):
    """Return ``tokens`` unchanged.

    Passed to ``CountVectorizer`` as its ``tokenizer``: the documents handed
    to the vectorizer are already lists of tokens, so this pass-through
    stands in for a real tokenizer.
    """
    return tokens

# The documents are token lists prepared in earlier cells, so the vectorizer's
# own lowercasing is switched off and its tokenizer is a pass-through.
vectorizer = CountVectorizer(
    tokenizer=do_nothing,
    lowercase=False,
)

# Learn the vocabulary and build the document-term matrix in one step.
X = vectorizer.fit_transform(docs)

In [None]:
# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement,
# get_feature_names_out(), and fall back only on older releases that lack it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Dense document-term table: one row per document, one column per term.
docs_df = pd.DataFrame(X.toarray(), columns=feature_names)

In [None]:
# Display the document-term table built from the vectorizer output.
docs_df

## Performing a PCA

In [None]:
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
from sklearn import preprocessing

## Normalize data

In [None]:
# Standardize the term columns with sklearn's preprocessing.scale, then wrap
# the resulting array in a DataFrame that carries the original column labels.
scaled_values = preprocessing.scale(docs_df)
data_scaled = pd.DataFrame(scaled_values, columns=docs_df.columns)

## Perform the PCA

In [None]:
# PCA cannot extract more components than min(n_samples, n_features), so cap
# the requested 10 to keep the cell from failing on small inputs.
n_components = min(10, *data_scaled.shape)

# With the default solver selection, PCA may use a randomized SVD on large
# inputs; a fixed seed keeps the projection reproducible across runs.
pca = PCA(n_components=n_components, random_state=42)
X_reduced = pca.fit_transform(data_scaled)

## Analyze the percentage of explained variance per principal component

In [None]:
# Explained-variance ratio of each fitted component, one row per component.
pd.DataFrame(pca.explained_variance_ratio_)

## 2D Plot

In [None]:
# Colour each point by its row index in the reduced matrix.
y = range(len(X_reduced))
first_pc, second_pc = X_reduced[:, 0], X_reduced[:, 1]

# Scatter the documents on the first two principal components.
plt.figure(1, figsize=(10, 10))
plt.scatter(first_pc, second_pc, c=y, cmap=plt.cm.Set2, edgecolor='k')
plt.title('2D Plot')

## 3D Plot

In [None]:
fig = plt.figure(1, figsize=(10, 10))
# Create the 3D axes through the figure: in current matplotlib, constructing
# Axes3D(fig, ...) directly no longer attaches the axes to the figure, which
# leaves the plot empty.
ax = fig.add_subplot(111, projection='3d')
ax.view_init(elev=-150, azim=110)

# Colour each point by its row index in the reduced matrix.
y = range(0, len(X_reduced))
# The colormap reference had stray pasted text inside it (a syntax error);
# restored to plt.cm.Set1.
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], X_reduced[:, 2], c=y,
           cmap=plt.cm.Set1, edgecolor='k')
ax.set_title("PCA 3D Plot")

## See the most important features per principal component