In [None]:
# Run this cell to set up your notebook
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import zipfile
import json

# Ensure that Pandas shows at least 280 characters in columns, so we can see full tweets
pd.set_option('max_colwidth', 280)

%matplotlib inline
plt.style.use('fivethirtyeight')
import seaborn as sns
sns.set()
sns.set_context("talk")
import re

In [None]:
# Read the two yearly tweet dumps from JSON and wrap each in a DataFrame.
with open('data/2016-2017.json', 'rb') as f:
    old_tweets = json.load(f)
with open('data/2017-2018.json', 'rb') as f:
    new_tweets = json.load(f)

# Only the first 10 rows of each dump are kept for the rest of the notebook.
df_old = pd.DataFrame(old_tweets).iloc[:10]
df_new = pd.DataFrame(new_tweets).iloc[:10]

In [None]:
# Build one tweet table, indexed by tweet id, from the two yearly frames.
# The newer dump stores the tweet body under 'full_text'; it is renamed to
# 'text' so both frames share the same columns.
#
# Each selection is copied before the id column is cast: writing through
# `.loc` into a slice of df_old/df_new triggers SettingWithCopyWarning and
# may leave the dtype unchanged.
df1 = df_old[['id', 'created_at', 'source', 'text', 'retweet_count']].copy()
df2 = df_new[['id', 'created_at', 'source', 'full_text', 'retweet_count']].copy()
df2 = df2.rename(columns={'full_text': 'text'})
df1['id'] = df1['id'].astype('int64')
df2['id'] = df2['id'].astype('int64')
print(df1.id.dtype)
df1 = df1.set_index('id')
df2 = df2.set_index('id')
# sort_index() returns a new frame, so the result must be kept; the original
# call discarded it and left `df` unsorted.
df = pd.concat([df1, df2]).sort_index()
df.size

In [None]:
# Display the combined tweet table `df` (indexed by tweet id).
df

In [None]:
# Split every tweet on whitespace and reshape to one row per (tweet id, token).
tokens_by_position = df['text'].str.split(expand=True).stack()
df_sep = tokens_by_position.to_frame().reset_index()
df_sep = df_sep.rename(columns={'level_1': 'num', 0: 'word'})

# The token position within the tweet ('num') is not needed downstream.
tmp = df_sep.drop(columns='num')
print(len(tmp))
tmp.head(20)

In [None]:
# Download the NLTK resources used by the following cells: the 'stopwords'
# corpus (for stop-word removal) and 'wordnet' (for the WordNet lemmatizer).
import nltk
import nltk.corpus
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# Load NLTK's English stop-word list; `stop_words` is extended with
# tweet-specific terms in the next cell.
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
print(stop_words)

In [None]:
# Extra tweet-specific stop words on top of NLTK's English list.
extra_stop_words = ['rt','t','co','https','realdonaldtrump','amp',"u",'hillary','trump2016','trump','clinton','http','ha','wa']

# Append only the entries that are not already present, so re-running this
# cell does not keep growing `stop_words` with duplicates.
stop_words.extend([w for w in extra_stop_words if w not in stop_words])

# Every stop word is lower-case, so compare against the lower-cased token;
# otherwise capitalised tokens such as "The", "RT" or "Hillary" slip through.
# The tokens themselves keep their original casing.
is_stop_word = tmp['word'].str.lower().isin(stop_words)

# .copy() makes the filtered frame independent, so later column assignments
# on `tmp` do not raise SettingWithCopyWarning.
tmp = tmp[~is_stop_word].copy()
tmp.head(20)

In [None]:
# Deal with plurals (and other inflections) by lemmatizing with WordNet.
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Small demonstration: (label, word, extra lemmatize() arguments).
demo_cases = [
    ("rocks: ", "rocks", {}),
    ("better: ", "better", {'pos': 'a'}),
    ("studying: ", "studying", {'pos': 'v'}),
]
for demo_label, demo_word, demo_kwargs in demo_cases:
    print(demo_label, lemmatizer.lemmatize(demo_word, **demo_kwargs))

In [None]:
# Fetch the 'punkt' resource, tokenize a sample sentence with NLTK, and show
# the default lemma of each token.
nltk.download('punkt')
text = 'studies studying cries cry'
tokenization = nltk.word_tokenize(text)
print("tokenization: ",tokenization)
for w in tokenization:
    lemma = lemmatizer.lemmatize(w)
    print("Lemma for {} is {}".format(w, lemma))

In [None]:
# Replace every token by its WordNet lemma (default part of speech).
# `tmp` is a filtered slice of the token table, so take an independent copy
# before assigning the column; writing into the slice directly raises
# SettingWithCopyWarning.
tmp = tmp.copy()
tmp['word'] = tmp['word'].apply(lemmatizer.lemmatize)
tmp.head(20)

In [None]:
# Count how often each word occurs, then keep the five largest counts.
word_counts = tmp['word'].value_counts(ascending=True)
top5 = word_counts.nlargest(5).to_frame()
top5

In [None]:
# Filter the token rows down to those whose word is one of the top-5 words,
# so every remaining id contains at least one of them.
in_top5 = tmp['word'].isin(top5.index)
tmp2 = tmp[in_top5]
tmp2

In [None]:
# Take the first 10 distinct tweet ids, in order of first appearance in tmp2.
idlist = tmp2['id'].unique()[:10]
idlist

In [None]:
# The index of `top5` holds the five most frequent words themselves.
top5.index

In [None]:
# Look up the full tweet rows for the selected ids.
df.loc[idlist]

In [None]:
# create the tf-matrix
# Rows follow `idlist`, columns follow the top-5 words; entry (i, j) is the
# number of times word j occurs among the tokens of tweet i.
words = top5.index

# Count from `tmp` (the stop-word-filtered, lemmatized tokens that `top5` was
# computed from) rather than re-splitting the raw tweet text: raw tokens are
# not lemmatized, so a lemma such as "job" would never match "jobs".
token_counts = tmp.groupby(['id', 'word']).size()

# Size the matrix from the data instead of hard-coding 10 x 5, which raised
# IndexError whenever fewer than 10 ids (or 5 words) were available.
matrix = np.zeros((len(idlist), len(words)))
for i, tweet_id in enumerate(idlist):
    for j, word in enumerate(words):
        # .get returns 0 when the (id, word) pair never occurs.
        matrix[i, j] = token_counts.get((tweet_id, word), 0)
matrix

# PCA

Are there eigenvalues for non-square matrices?

Strictly speaking, non-square matrices do not have eigenvalues: eigenvalues are defined only for square matrices. For non-square matrices, we can instead define singular values:

Definition: The singular values of an $m \times n$ matrix $A$ are the positive square roots of the nonzero eigenvalues of the corresponding matrix $A^TA$. The corresponding eigenvectors of $A^TA$ are called the (right) singular vectors.


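To calculate the SVD of a non-square matrix $A$, we factor it as $A = U \Sigma V^T$: the columns of $U$ are the left singular vectors (eigenvectors of $AA^T$) and the columns of $V$ are the right singular vectors (eigenvectors of $A^TA$).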
https://math.stackexchange.com/questions/3982195/what-are-left-and-right-singular-vectors-in-svd

In [None]:
import scipy.linalg as linalg

# Center every word column on its mean across the selected tweets.
mu = matrix.mean(axis=0)
matrix_ = matrix - mu

# Covariance matrix of the word columns. Both factors must be the centered
# matrix; the original multiplied by the uncentered `matrix`, which matches
# only up to floating-point error and need not be exactly symmetric.
# Note: this divides by n, whereas sklearn's PCA.explained_variance_ uses
# n - 1, so `lam` differs from it by the factor (n - 1) / n.
Sigma = matrix_.T @ matrix_ / len(matrix_)

# SVD of the covariance matrix: the columns of `u` are the principal
# directions and `lam` holds the corresponding variances.
u, lam, uh = linalg.svd(Sigma)

In [None]:
# First factor returned by linalg.svd(Sigma); its columns are plotted below.
u

In [None]:
# PCA
# Fit scikit-learn's PCA on the same matrix and print the components it
# finds (one row per component).
from sklearn.decomposition import PCA
pca = PCA(n_components=5).fit(matrix)
print(pca.components_)

In [None]:
# Variance explained by each of the fitted components.
print(pca.explained_variance_)

In [None]:
import seaborn as sns

# Bar chart of the first column of `u`: one bar per top-5 word.
fig, ax = plt.subplots(figsize=(20, 10))
sns.barplot(x=words, y=u[:, 0], ax=ax)

In [None]:
# Bar chart of the second column of `u`: one bar per top-5 word.
fig, ax = plt.subplots(figsize=(20, 10))
sns.barplot(x=words, y=u[:, 1], ax=ax)

In [None]:
# Bar chart of the third column of `u`: one bar per top-5 word.
fig, ax = plt.subplots(figsize=(20, 10))
sns.barplot(x=words, y=u[:, 2], ax=ax)

In [None]:
# Bar chart of the fourth column of `u`: one bar per top-5 word.
fig, ax = plt.subplots(figsize=(20, 10))
sns.barplot(x=words, y=u[:, 3], ax=ax)

In [None]:
# Bar chart of the fifth column of `u`: one bar per top-5 word.
fig, ax = plt.subplots(figsize=(20, 10))
sns.barplot(x=words, y=u[:, 4], ax=ax)