In [None]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import nltk
from nltk.tokenize import word_tokenize

# Celebrities dataset

In [None]:
# Age --> age categories
# Replace the numeric 'age' column by a string 'age_cat' column holding one of
# five age brackets.

df = pd.read_csv('C:/Users/Jens/Notes Jupyter/Thesis-files/celebrities.csv', encoding='utf8')

# Bracket edges are left-inclusive / right-exclusive:
# [18, 25), [25, 35), [35, 50), [50, 65), [65, inf).
# The labels spell out exactly these brackets, so age 65 belongs to '65-xx'.
age_bins = [18, 25, 35, 50, 65, np.inf]
labels = ['18-24', '25-34', '35-49', '50-64', '65-xx']

# pd.cut also bins non-integer ages (e.g. 24.5), which an `x in range(...)`
# membership test would not match.
age_cats = pd.cut(df['age'], bins=age_bins, labels=labels, right=False)

# Store plain strings instead of a Categorical so that a later groupby on
# 'age_cat' only yields brackets that actually occur in the data.
df['age_cat'] = age_cats.astype(object)

# Ages that are missing or below 18 fit none of the brackets and come out as
# NaN; drop those rows so they cannot turn into extra classes.
df = (
    df.dropna(subset=['age_cat'])
      .drop(columns='age')
      .reset_index(drop=True)
)
df['age_cat'].value_counts()

In [None]:
# BOW base model
# Baseline: word-unigram TF-IDF features fed into a logistic regression,
# evaluated on a 20% hold-out split of `df`.
y = df['age_cat']

# Split the raw documents first, so the vectorizer never sees held-out text.
text_train, text_test, y_train, y_test = train_test_split(
    df['text'], y, test_size=0.20, random_state=42)

tfidfvect = TfidfVectorizer(analyzer='word', ngram_range=(1, 1))

# Vocabulary and IDF weights are learned from the training documents only;
# the test documents are merely projected onto that vocabulary.
X_train = tfidfvect.fit_transform(text_train)
X_test = tfidfvect.transform(text_test)

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

y_test_pred = logreg.predict(X_test)

print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

In [None]:
#balanced dataset:
# Down-sample every age category to the size of the smallest category.
n_per_class = df['age_cat'].value_counts().min()

# A fixed random_state makes the balanced subset identical on every run.
# Sampling each group directly keeps all columns (including 'age_cat') and
# yields a flat 0..n-1 index.
balanced_df = pd.concat(
    [group.sample(n=n_per_class, random_state=42)
     for _, group in df.groupby('age_cat')],
    ignore_index=True,
)

In [None]:
# balanced data - with optimised features
# Word-unigram TF-IDF (binary term counts, NLTK tokenisation, case preserved)
# fed into a logistic regression, evaluated on a 20% hold-out split of
# `balanced_df`.
y = balanced_df['age_cat']

# Split the raw documents first, so neither the vectorizer nor the classifier
# is fitted on held-out rows.
text_train, text_test, y_train, y_test = train_test_split(
    balanced_df['text'], y, test_size=0.20, random_state=42)

# No token_pattern here: scikit-learn ignores it whenever a custom tokenizer
# is supplied.
tfidfvect = TfidfVectorizer(max_features=55000,
                            ngram_range=(1,1),
                            lowercase=False,
                            analyzer='word',
                            binary=True,
                            tokenizer=word_tokenize)

X_train = tfidfvect.fit_transform(text_train)
X_test = tfidfvect.transform(text_test)

logreg = LogisticRegression()
# Fit on the training rows only; fitting on all rows would let the model see
# the test rows it is scored on below.
logreg.fit(X_train, y_train)

y_test_pred = logreg.predict(X_test)

print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

# The ~80% accuracy noted previously for this cell was obtained with the model
# fitted on all rows (test rows included); the held-out score printed above is
# expected to be lower.

In [None]:
# balanced data - crossval
# 10-fold cross-validated accuracy of character-trigram TF-IDF + logistic
# regression on `balanced_df`.

# No tokenizer here: scikit-learn only applies it when analyzer == 'word'.
tfidfvect = TfidfVectorizer(max_features=30000,
                            ngram_range=(3,3),
                            lowercase=False,
                            analyzer='char',
                            binary=False)

logreg = LogisticRegression()

# Wrapping both steps in a Pipeline makes cross_val_score refit the vectorizer
# inside every fold, so the validation fold never contributes to the
# vocabulary or IDF weights.
char_model = Pipeline([('tfidf', tfidfvect), ('logreg', logreg)])

X = balanced_df['text']
y = balanced_df['age_cat']

scores = cross_val_score(char_model, X, y, cv=10, scoring='accuracy')
print("Accuracy:", scores)
print ("Mean:", scores.mean())

# these scores are drastically lower: every fold is scored on rows that were
# left out of fitting, and the features are character trigrams rather than
# word unigrams