In [169]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.feature_extraction.text import TfidfVectorizer as tfidf

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

In [251]:
tech = pd.read_csv("/Users/jennayang/Documents/Medium Project Data/medium-tech-data.csv")
money = pd.read_csv("/Users/jennayang/Documents/Medium Project Data/Medium_Money_Data_final.csv", index_col = 0) 
sports = pd.read_csv("/Users/jennayang/Documents/Medium Project Data/medium-sports-data.csv", index_col = 0) 
politics = pd.read_csv("/Users/jennayang/Documents/Medium Project Data/Politics_data_full.csv", index_col = 0) 

In [252]:
#there was a bug with the webscraping: people with 2020 claps have the wrong number of claps. 
#checked via website, having differing number of claps
politics = politics[politics.claps != 2020]

#get ratio of clap to followers, put as a new feature
clap_ratio = politics.claps / politics.followers

#get number of followers:
followers = politics.followers

#put the clap ratio as a new feature
politics["clap_ratio"] = clap_ratio



In [253]:
#cleaning the data - removing nulls and empty vals
tech = tech[pd.notnull(tech['text'])]
tech.reset_index(inplace = True)

sports = sports[pd.notnull(sports['text'])]
sports.reset_index(inplace = True)

politics = politics[pd.notnull(politics['text'])]
politics.reset_index(inplace = True)

money = money[pd.notnull(money['text'])] 
money.reset_index(inplace = True)

politics = politics[~(politics.text == '[]')]
money = money[~(money.text == '[]')]
sports = sports[~(sports.text == '[]')]
tech = tech[~(tech.text == '[]')]

#shuffle and randomly select 2000 entries (downsampling tech and politics) (money keep at 1600) (sports keep at 2500)
str1 = tech.sample(2000, replace= False)[["text"]].reset_index(drop=True)
str2 = sports.sample(len(sports), replace= False)[["text"]].reset_index(drop=True)
str3 = politics.sample(2000, replace= False)[["text"]].reset_index(drop=True)
str4 = money.sample(len(money), replace= False)[["text"]].reset_index(drop=True)

In [254]:
#Breaking our data up into training and test sets (80/20 split)
pd.options.mode.chained_assignment = None #surpress warning

#first, we need to add the labels
str1["label"] = [0] * len(str1)
str2["label"] = [1] * len(str2)
str3["label"] = [2] * len(str3)
str4["label"] = [3] * len(str4)


#making the training set and test sets
str2_break = int(len(str2) * 0.8)
str4_break = int(len(str4) * 0.8)
str1_train = str1.iloc[:1600] #tech - 2000
str2_train = str2.iloc[:str2_break] 
str3_train = str3.iloc[:1600] #politics - 2000
str4_train = str4.iloc[:str4_break]

str1_test = str1.iloc[1600:] #tech - 2000
str2_test = str2.iloc[str2_break:] 
str3_test = str3.iloc[1600:] #politics - 2000
str4_test = str4.iloc[str4_break:]

In [255]:
#making the combined training sets and test sets
one = pd.concat([str1_train, str2_train], axis = 0)
two = pd.concat([str3_train, str4_train], axis = 0)
train = pd.concat([one, two], axis = 0)

first = pd.concat([str1_test, str2_test], axis = 0)
second = pd.concat([str3_test, str4_test], axis = 0)
test = pd.concat([first, second], axis = 0)

In [256]:
#shuffling
train = train.sample(frac = 1, replace = False).reset_index(drop=True)
test = test.sample(frac = 1, replace = False).reset_index(drop=True)

#get X and Y data from the sets
all_words = train.text
Y_training = train.label

all_words_test = test.text
Y_test = test.label


In [257]:
train.head()

Unnamed: 0,text,label
0,"The story is part of the SF Homeless Project, ...",0
1,['[This is especially true as you get closer t...,3
2,I was going to lead off this post with Miguel ...,1
3,['[I love the New York Times. Or loved. I canc...,2
4,"Three Saturdays ago, a 19-year-old man posted ...",0


In [258]:
#create the vector
#focus on max_features as well
vector = tfidf(stop_words = "english", strip_accents = 'ascii', max_features = 50)

#fit the data
vector.fit(all_words)

#can look at the vectorizer
print(vector.get_feature_names())


['000', 'american', 'best', 'better', 'day', 'did', 'does', 'dont', 'financial', 'game', 'going', 'good', 'government', 'im', 'income', 'just', 'know', 'life', 'like', 'long', 'make', 'market', 'money', 'need', 'new', 'pay', 'people', 'point', 'political', 'president', 'really', 'right', 'said', 'say', 'state', 'states', 'team', 'thats', 'things', 'think', 'time', 'trump', 'use', 'want', 'way', 'work', 'world', 'year', 'years', 'youre']


In [265]:
#transforming the data, manipulating it based on the weights we 
#determined before by fitting our data 
train_transformed = vector.transform(all_words)
test_transformed = vector.transform(all_words_test)

In [260]:
train_transformed

<6537x50 sparse matrix of type '<class 'numpy.float64'>'
	with 88969 stored elements in Compressed Sparse Row format>

In [266]:
#splitting
train_test_split(train_transformed, Y_training)[0], train_test_split(train_transformed, Y_training)[1]

(<4902x50 sparse matrix of type '<class 'numpy.float64'>'
 	with 67469 stored elements in Compressed Sparse Row format>,
 <1635x50 sparse matrix of type '<class 'numpy.float64'>'
 	with 22910 stored elements in Compressed Sparse Row format>)

In [277]:
#create the KNN model
knearest = knn(n_neighbors = 10)

#fit the model with training data
knearest.fit(train_transformed, Y_training)


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                     weights='uniform')

In [278]:
knearest.score(test_transformed, Y_test)

0.7314984709480122

# Hyperparameter Tuning

In [275]:
#Trying different k values
knn_scores = []

for i in range(30):
    knearest = knn(n_neighbors = i+1)
    knearest.fit(train_transformed, Y_training)
    
    knn_scores.append(knearest.score(test_transformed, Y_test))

In [276]:
knn_scores.index(max(knn_scores))+1

10

KNearestNeighbors Notes

unsuprevised, so we don't know the labels (talk about this fact)
try different values of k



Make a model called Kmeans from sklearn.cluster import kmeans

Inertia of the model: how close the clusters are to one another, how clustered the clusters are

Make n clusters = k, increase the k value from 0 to certain #, fit X data, append to list the inertia

look up: elbow method for optimal value of k in kmeans


KNN wouldn't be the best method for us because we do have the labels, so we can just use supervised.

keep in mind: keep max features low b/c curse of dimensionality


Graphs: graph of the k's (elbow).

Analysis: compare distortions vs inertia, analyze the clusters, what's inside, which ones tend to get clustered together or not, 
