In [601]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.feature_extraction.text import TfidfVectorizer as tfidf
from sklearn.model_selection import train_test_split
from sklearn.linear_model import BayesianRidge

In [530]:
## importing all csv data
money = pd.read_csv('money_with_tags.csv').drop('Unnamed: 0', axis=1)
tech = pd.read_csv('medium-tech-data.csv').drop('Unnamed: 0', axis=1)
sports = pd.read_csv('medium-sports-data.csv').drop('Unnamed: 0', axis=1)
politics = pd.read_csv('Politics_data_full.csv').drop('Unnamed: 0', axis=1)

In [531]:
# removing columns in politics that are not in tech
for i in politics.columns:
    if i in tech.columns:
        continue
    else:
        politics = politics.drop(i, axis=1)

In [532]:
# removing columns in tech that are not in politics
for i in tech.columns:
    if i in politics.columns:
        continue
    else:
        tech = tech.drop(i, axis=1)

In [533]:
# removing columns in sports that are not in the other tables
for i in sports.columns:
    if i in politics.columns:
        continue
    else:
        sports = sports.drop(i, axis=1)

In [534]:
# removing columns in money that are not in the other tables
for i in money.columns:
    if i in politics.columns:
        continue
    else:
        money = money.drop(i, axis=1)

In [535]:
def convert_to_int(followers):
    if followers[-1]=='K':
        return 1000* float(followers[:-1])
    return int(followers)
def check_if_k(string):
    if ',' in string:
        return False
    if 'fall' in string:
        return False
    if 'Whether' in string:
        return False
    if string[-1]=='K' and string[:-1].isalpha()==False:
        return True
    elif string[-1]!='K' and string.isalpha()==False:
        return True
    return False
sports = sports.loc[sports['claps'].apply(check_if_k)].reset_index(drop=True)

In [536]:
# clean sports table
sports = sports.dropna().reset_index(drop=True)
sports = sports.loc[list(sports['tags'].apply(lambda x: x[0]=='['))].reset_index(drop=True)
sports['followers']=sports['followers'].apply(convert_to_int)
sports['following']=sports['following'].apply(convert_to_int)
sports['claps'] = sports['claps'].apply(convert_to_int)
sports['published'] = pd.to_datetime(sports['published'])

In [537]:
# clean politics table
tech['followers']=tech['followers'].apply(convert_to_int)
tech['following']=tech['following'].apply(convert_to_int)
tech = tech.loc[tech['claps'].apply(check_if_k)].reset_index(drop=True)
tech['claps']=tech['claps'].apply(convert_to_int)
tech['published'] = pd.to_datetime(tech['published'])

In [538]:
# clean politics table
politics = politics[politics.claps != 2020].reset_index(drop=True)

In [539]:
# concat 
c_table = pd.concat([tech,sports,money, politics],sort=False)

In [732]:
f_table = c_table.drop(['url', 'author', 'published','username','title'],axis=1).reset_index(drop=True)

In [733]:
import re

In [734]:
def tag_words(tags):
    return ' '.join(re.findall("\w+", tags)).lower()

In [735]:
f_table['tags']=f_table['tags'].apply(tag_words)

In [736]:
f_table['text']=f_table['text'].apply(tag_words)

In [737]:
vector = tfidf(stop_words = "english", strip_accents = 'ascii', max_features = 200)

In [738]:
vector.fit(f_table['text'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=200,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents='ascii',
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [739]:
text_tfidf = vector.transform(f_table['text']).toarray()

In [740]:
f_table = pd.concat([f_table,pd.DataFrame(text_tfidf, columns=vector.get_feature_names())],axis=1)

In [741]:
vector.fit(f_table['tags'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=200,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents='ascii',
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [742]:
text_tfidf = vector.transform(f_table['tags']).toarray()

In [743]:
f_table = pd.concat([f_table,pd.DataFrame(text_tfidf, columns=vector.get_feature_names())],axis=1)

In [744]:
f_table = f_table.drop(['text','tags',],axis=1)

In [745]:
f_table = f_table.dropna()

In [746]:
X= f_table.drop('claps',axis=1).values

In [747]:
y= list(f_table['claps'])

In [748]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [749]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.5)

In [775]:
clf = KNeighborsRegressor(n_neighbors=2)

In [776]:
clf.fit(X_train, y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=2, p=2,
                    weights='uniform')

In [777]:
clf.score(X_test, y_test)

-0.9738718767813463

In [778]:
clf.score(X_train, y_train)

0.6707125736604342

In [779]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn import metrics

In [780]:
scores = cross_val_score(clf, X_train, y_train, cv=10)

In [781]:
scores

array([ -0.19720387,  -0.26071004,   0.25078938,  -1.24871174,
       -10.08886699,   0.14315178,  -0.01149929,   0.67412624,
         0.27179235,  -0.40803868])