In [192]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.feature_extraction.text import TfidfVectorizer as tfidf

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

In [155]:
tech = pd.read_csv("/Users/jennayang/Documents/Medium Project Data/medium-tech-data.csv")
money = pd.read_csv("/Users/jennayang/Documents/Medium Project Data/Medium_Money_Data_final.csv", index_col = 0) 
sports = pd.read_csv("/Users/jennayang/Documents/Medium Project Data/medium-sports-data.csv", index_col = 0) 
politics = pd.read_csv("/Users/jennayang/Documents/Medium Project Data/Politics_data_full.csv", index_col = 0) 

In [156]:
#code from Jesse's tfidf_model_demo
#there was a bug with the webscraping: people with 2020 claps have the wrong number of claps. 
#checked via website, having differing number of claps
politics = politics[politics.claps != 2020]

#get ratio of clap to followers, put as a new feature
clap_ratio = politics.claps / politics.followers

#get number of followers:
followers = politics.followers

#put the clap ratio as a new feature
politics["clap_ratio"] = clap_ratio

In [157]:
#cleaning the data
tech = tech[pd.notnull(tech['text'])]
tech = tech.sample(n = 3000) #randomly sample 3000 articles
tech.reset_index(inplace = True)

sports = sports[pd.notnull(sports['text'])]
sports.reset_index(inplace = True)

#politics = politics[pd.notnull(tech['text'])]
#politics.reset_index(inplace = True)

#remove Pixabay 
#for i in range(len(politics)):
#    if 'Pixabay' in politics.text[i]:
#        politics.at[i, 'text'] = politics.text[i].replace('Pixabay', '')

In [165]:
all_str = pd.concat([tech.text[:1875], sports.text[:1875]]).reset_index(drop=True)
all_str

0       Virtual Reality (VR) experiences that are made...
1       Speculative fiction is my favorite genre to re...
2       While Facebook and Twitter are the dominant pl...
3       By Mark Skapinker I’ve been fortunate enough t...
4       You might think capping your data plan would s...
                              ...                        
3745    By Kieran O’Dwyer The following article is fro...
3746    Hello again, my friend! I hope all has gone we...
3747    The most important stretch of baseball is comi...
3748    By now, everyone’s seen Nike’s latest Just Do ...
3749    The Jets missed out on two-time All-Pro and fo...
Name: text, Length: 3750, dtype: object

In [176]:
#create the vector
#focus on max_features as well
vector = tfidf(stop_words = "english", strip_accents = 'ascii', max_features = 50)
#porter stemmer???

#fit the data
vector.fit(all_str)

#can look at the vectorizer
print(vector.get_feature_names())


['ball', 'baseball', 'best', 'better', 'big', 'data', 'day', 'did', 'dont', 'fans', 'football', 'game', 'games', 'going', 'good', 'hes', 'home', 'im', 'just', 'know', 'league', 'like', 'long', 'make', 'nba', 'need', 'new', 'people', 'play', 'player', 'players', 'point', 'right', 'run', 'said', 'season', 'sports', 'team', 'teams', 'thats', 'things', 'think', 'time', 'use', 'way', 'win', 'work', 'world', 'year', 'years']


In [177]:
#transforming the data, aka manipulating it based on the weights we 
#determined before by fitting our data 
train = vector.transform(all_str)

In [178]:
train

<3750x50 sparse matrix of type '<class 'numpy.float64'>'
	with 21621 stored elements in Compressed Sparse Row format>

In [179]:
train_Y = [0] * 1875 + [1] * 1875
train_Y

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [180]:
#create the test set
test1 = tech.text.iloc[1875:2500]
test2 = sports.text.iloc[1875:2500]
#test1
test_set = pd.concat([test1, test2]).reset_index(drop=True)

In [181]:
#still need to preprocess (in string form)
test_trans = vector.transform(test_set)

In [182]:
test_trans

<1250x50 sparse matrix of type '<class 'numpy.float64'>'
	with 13115 stored elements in Compressed Sparse Row format>

In [183]:
# need Y (aka labels since this is supervised)

test_Y = [0] * 625 + [1] * 625

In [185]:
train_Y

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [193]:
#create the model (look at the documentation to see if there's any 
# parameters) you should pay closer attention to.

#this is how you create a basic decision tree
dt = DecisionTreeClassifier()
knearest = knn(n_neighbors=5)
logisticRegr = LogisticRegression()


#fit the model, use your train data
dt.fit(train, train_Y)
knearest.fit(train, train_Y)
logisticRegr.fit(train, train_Y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [188]:
dt.score(test_trans, test_Y)

0.8544

In [191]:
knearest.score(test_trans, test_Y)

0.8568

In [194]:
logisticRegr.score(test_trans, test_Y)

0.8944