# Import libraries, create document class to hold info

In [1]:
#  /Data/celebs-usa/female contains 381 texts by females
#  /Data/celebs-usa/male contains 912 texts by males
#  /Data/celebs-other-json contains text by

# Identify birth year as that is a constant, these tweets are from 2011-2018, age range constantly changes but birth year stays constant
# using birth year, predict age 10-15, 15-20, 20-25, 25-30, 30-35, 35-40, 45-55,55+  
#{'25-34', '35-44', '45-54', '55-64', '65+'}

from os import listdir, makedirs
from os.path import isfile, join, splitext, split
import json
from collections import Counter
import ftfy
import re
import nltk
import copy
from collections import Counter
import seaborn as sns
import pandas as pd
import numpy as np
import ndjson
import pickle
import os
import jsonlines
from sklearn.base import BaseEstimator, TransformerMixin

        
hashtag_re = re.compile(r"#\w+")
mention_re = re.compile(r"@\w+")
url_re = re.compile(r"(?:https?://)?(?:[-\w]+\.)+[a-zA-Z]{2,9}[-\w/#~:;.?+=&%@~]*")

def preprocess(text):
    p_text = hashtag_re.sub("[hashtag]",text)
    p_text = mention_re.sub("[mention]",p_text)
    p_text = url_re.sub("[url]",p_text)
    p_text = ftfy.fix_text(p_text)
    return p_text.lower()

tokenise_re = re.compile(r"(\[[^\]]+\]|[-'\w]+|[^\s\w\[']+)") #([]|words|other non-space)
def tokenise(text):
    return tokenise_re.findall(text)

        
class Document:
    def __init__(self, meta={}):
        self.meta = meta
        self.tokens_fql = Counter() #empty Counter, ready to be added to with Counter.update.
        self.pos_fql = Counter()
        self.pos_list = [] #empty list for pos tags from running text.
        self.num_tokens = 0
        
    def extract_features_from_text(self, text):
        p_text = preprocess(text)
        tokens = tokenise(p_text)
        self.num_tokens += len(tokens)
        self.tokens_fql.update(tokens) #updating Counter counts items in list, adding to existing Counter items.
        pos_tagged = nltk.pos_tag(tokens)
        pos = [tag[1] for tag in pos_tagged]
        self.pos_fql.update(pos)
        self.pos_list.extend(pos)
        
    def extract_features_from_texts(self, texts): #texts should be iterable text lines, e.g. read in from file.
        for text in texts:
            extract_features_from_text(text)
            
    def average_token_length(self):
        sum_lengths = 0
        for key, value in self.tokens_fql.items():
            sum_lengths += len(key) * value
        return sum_lengths / self.num_tokens
    
class DocumentProcessor(BaseEstimator, TransformerMixin):
    def __init__(self, process_method):
        self.process_method = process_method
    
    def fit(self, X, y=None): #no fitting necessary, although could use this to build a vocabulary for all documents, and then limit to set (e.g. top 1000).
        return self

    def transform(self, documents):
        for document in documents:
            yield self.process_method(document)
            
def get_tokens_fql(document):
    return document.tokens_fql

def get_pos_fql(document):
    return document.pos_fql

def get_text_stats(document):
    ttr = len(document.tokens_fql) / document.num_tokens
    return {'avg_token_length': document.average_token_length(), 'ttr': ttr }


def read_list(file):
    with open(file) as f:
        items = []
        lines = f.readlines()
        for line in lines:
            items.append(line.strip())
    return items

fws = read_list("functionwords.txt")

def get_fws_fql(document):
    fws_fql = Counter({t: document.tokens_fql[t] for t in fws}) 
    #dict comprehension, t: fql[t] is token: freq.
    return +fws_fql

def custom_tokenise(text):
    return tokenise_re.findall(text.lower())

def preprocess(text):
    p_text = hashtag_re.sub("[hashtag]",text)
    p_text = mention_re.sub("[mention]",p_text)
    p_text = url_re.sub("[url]",p_text)
    p_text = ftfy.fix_text(p_text)
    return p_text

def confusion_matrix_heatmap(cm, index):
    cmdf = pd.DataFrame(cm, index = index, columns=index)
    dims = (5, 5)
    fig, ax = plt.subplots(figsize=dims)
    sns.heatmap(cmdf, annot=True, cmap="coolwarm", center=0)
    ax.set_ylabel('Actual')    
    ax.set_xlabel('Predicted')

# Reading in Celebrity Data, converting to Document Class and saving to pickle file

In [2]:
'''
This gets the celebrity data and adds the correct gender and ages 
to each json object with text
'''
def getCelebData():
    path = '/home/jay/Downloads/pan19-celebrity-profiling-training-dataset-2019-01-31/feeds.ndjson'
    path2 = '/home/jay/Downloads/pan19-celebrity-profiling-training-dataset-2019-01-31/labels.ndjson'
    x=0
    # Read in the twitter text
    data = []
    with jsonlines.open(path) as reader:
        for obj in reader:
            x+=1
            print('Reading no ',x)
            data.append(obj)
            if len(data) >19999:
                break;
    # Here the correct labels are identified and paried           
    labels = []
    with jsonlines.open(path2) as reader:
        for obj in reader:
            if obj['gender']!='nonbinary':
                for d in data:
                    if d['id'] == obj['id']:
                        d['gender'] = obj['gender']
                        d['birthyear'] = obj['birthyear']
    return data

'''
This function returns the 5 year group a year of birth resides in, e.g. 1995 is between 1995-1999 (inclusive 5 year period)
'''
def getYearRange(yearOfBirth):
    YearGroupGap = 5
    for minYear in range(1900,2015,YearGroupGap):
        maxYear = minYear+YearGroupGap
        #print('min: ',minYear,'max: ',maxYear)
        if (yearOfBirth >= minYear) and (yearOfBirth < maxYear):
            return( str(minYear)+'-'+str(maxYear-1) )
    raise Exception('year of Birth passed in - ' + str(yearOfBirth)+' is not in range of min and max years' ) 
    

'''
This helper function uses the Document class to return a doc
class for each user with the correct gender, age and tweets
'''
def getDocument(data):
    try:
        gender    = data['gender']
        birthyear = data['birthyear']
        if data['birthyear'] != 'unknown':
            birthYearRange = getYearRange(data['birthyear'])

        doc = Document({'gender': gender, 'birthyear':birthyear, 'birthyearrange':birthYearRange}) #include metadata
        for tweet in data['text']:
            doc.extract_features_from_text(tweet)
        return doc
    except:
        print("An exception occurred")

'''
Check if the pickle file exists, if not then create it, else read in
'''
corpus = []
if os.path.exists("/home/jay/Documents/AppliedDataMining/FinalProject/Data/CelebFile"):
    with open('/home/jay/Documents/AppliedDataMining/FinalProject/Data/CelebFile', 'rb') as fp:
        corpus = pickle.load(fp)
    corpus = []
    for i in range(5000):
        path = '/home/jay/Documents/AppliedDataMining/FinalProject/Data/20000Celebs/'
        path += 'Celeb'+str(i)
        with open(path, 'rb') as fp:
            obj = pickle.load(fp)
            if obj is not None:
                corpus.append(obj)
            print('done ',i)

        
    print('CELEB FILE EXISTS')
else:
    print('CELEB FILE DOES NOT EXISTS, CREATING')
    # Call the function to get the twitter data
    corpus = getCelebData()
    print('Read unprocessed, saving to file')
    #with open('/home/jay/Documents/AppliedDataMining/FinalProject/Data/UnProcessedCelebFile', 'wb') as fpc:
    #    pickle.dump(corpus, fpc)
    print('Saved to file')
    #For each json object, convert it to a document object
    for i in range(len(corpus)):
        print('Doing Obj Number: ',i)
        corpus[i] = getDocument(corpus[i])
    
    with open('/home/jay/Documents/AppliedDataMining/FinalProject/Data/CelebFile', 'wb') as fp:
        pickle.dump(corpus, fp)
    print('CELEB FILE CREATED')
print('OUT ERE')

done  0
done  1
done  2
done  3
done  4
done  5
done  6
done  7
done  8
done  9
done  10
done  11
done  12
done  13
done  14
done  15
done  16
done  17
done  18
done  19
done  20
done  21
done  22
done  23
done  24
done  25
done  26
done  27
done  28
done  29
done  30
done  31
done  32
done  33
done  34
done  35
done  36
done  37
done  38
done  39
done  40
done  41
done  42
done  43
done  44
done  45
done  46
done  47
done  48
done  49
done  50
done  51
done  52
done  53
done  54
done  55
done  56
done  57
done  58
done  59
done  60
done  61
done  62
done  63
done  64
done  65
done  66
done  67
done  68
done  69
done  70
done  71
done  72
done  73
done  74
done  75
done  76
done  77
done  78
done  79
done  80
done  81
done  82
done  83
done  84
done  85
done  86
done  87
done  88
done  89
done  90
done  91
done  92
done  93
done  94
done  95
done  96
done  97
done  98
done  99
done  100
done  101
done  102
done  103
done  104
done  105
done  106
done  107
done  108
done  109
done  110


done  898
done  899
done  900
done  901
done  902
done  903
done  904
done  905
done  906
done  907
done  908
done  909
done  910
done  911
done  912
done  913
done  914
done  915
done  916
done  917
done  918
done  919
done  920
done  921
done  922
done  923
done  924
done  925
done  926
done  927
done  928
done  929
done  930
done  931
done  932
done  933
done  934
done  935
done  936
done  937
done  938
done  939
done  940
done  941
done  942
done  943
done  944
done  945
done  946
done  947
done  948
done  949
done  950
done  951
done  952
done  953
done  954
done  955
done  956
done  957
done  958
done  959
done  960
done  961
done  962
done  963
done  964
done  965
done  966
done  967
done  968
done  969
done  970
done  971
done  972
done  973
done  974
done  975
done  976
done  977
done  978
done  979
done  980
done  981
done  982
done  983
done  984
done  985
done  986
done  987
done  988
done  989
done  990
done  991
done  992
done  993
done  994
done  995
done  996
done  997


done  1705
done  1706
done  1707
done  1708
done  1709
done  1710
done  1711
done  1712
done  1713
done  1714
done  1715
done  1716
done  1717
done  1718
done  1719
done  1720
done  1721
done  1722
done  1723
done  1724
done  1725
done  1726
done  1727
done  1728
done  1729
done  1730
done  1731
done  1732
done  1733
done  1734
done  1735
done  1736
done  1737
done  1738
done  1739
done  1740
done  1741
done  1742
done  1743
done  1744
done  1745
done  1746
done  1747
done  1748
done  1749
done  1750
done  1751
done  1752
done  1753
done  1754
done  1755
done  1756
done  1757
done  1758
done  1759
done  1760
done  1761
done  1762
done  1763
done  1764
done  1765
done  1766
done  1767
done  1768
done  1769
done  1770
done  1771
done  1772
done  1773
done  1774
done  1775
done  1776
done  1777
done  1778
done  1779
done  1780
done  1781
done  1782
done  1783
done  1784
done  1785
done  1786
done  1787
done  1788
done  1789
done  1790
done  1791
done  1792
done  1793
done  1794
done  1795

done  2470
done  2471
done  2472
done  2473
done  2474
done  2475
done  2476
done  2477
done  2478
done  2479
done  2480
done  2481
done  2482
done  2483
done  2484
done  2485
done  2486
done  2487
done  2488
done  2489
done  2490
done  2491
done  2492
done  2493
done  2494
done  2495
done  2496
done  2497
done  2498
done  2499
done  2500
done  2501
done  2502
done  2503
done  2504
done  2505
done  2506
done  2507
done  2508
done  2509
done  2510
done  2511
done  2512
done  2513
done  2514
done  2515
done  2516
done  2517
done  2518
done  2519
done  2520
done  2521
done  2522
done  2523
done  2524
done  2525
done  2526
done  2527
done  2528
done  2529
done  2530
done  2531
done  2532
done  2533
done  2534
done  2535
done  2536
done  2537
done  2538
done  2539
done  2540
done  2541
done  2542
done  2543
done  2544
done  2545
done  2546
done  2547
done  2548
done  2549
done  2550
done  2551
done  2552
done  2553
done  2554
done  2555
done  2556
done  2557
done  2558
done  2559
done  2560

done  3236
done  3237
done  3238
done  3239
done  3240
done  3241
done  3242
done  3243
done  3244
done  3245
done  3246
done  3247
done  3248
done  3249
done  3250
done  3251
done  3252
done  3253
done  3254
done  3255
done  3256
done  3257
done  3258
done  3259
done  3260
done  3261
done  3262
done  3263
done  3264
done  3265
done  3266
done  3267
done  3268
done  3269
done  3270
done  3271
done  3272
done  3273
done  3274
done  3275
done  3276
done  3277
done  3278
done  3279
done  3280
done  3281
done  3282
done  3283
done  3284
done  3285
done  3286
done  3287
done  3288
done  3289
done  3290
done  3291
done  3292
done  3293
done  3294
done  3295
done  3296
done  3297
done  3298
done  3299
done  3300
done  3301
done  3302
done  3303
done  3304
done  3305
done  3306
done  3307
done  3308
done  3309
done  3310
done  3311
done  3312
done  3313
done  3314
done  3315
done  3316
done  3317
done  3318
done  3319
done  3320
done  3321
done  3322
done  3323
done  3324
done  3325
done  3326

done  4021
done  4022
done  4023
done  4024
done  4025
done  4026
done  4027
done  4028
done  4029
done  4030
done  4031
done  4032
done  4033
done  4034
done  4035
done  4036
done  4037
done  4038
done  4039
done  4040
done  4041
done  4042
done  4043
done  4044
done  4045
done  4046
done  4047
done  4048
done  4049
done  4050
done  4051
done  4052
done  4053
done  4054
done  4055
done  4056
done  4057
done  4058
done  4059
done  4060
done  4061
done  4062
done  4063
done  4064
done  4065
done  4066
done  4067
done  4068
done  4069
done  4070
done  4071
done  4072
done  4073
done  4074
done  4075
done  4076
done  4077
done  4078
done  4079
done  4080
done  4081
done  4082
done  4083
done  4084
done  4085
done  4086
done  4087
done  4088
done  4089
done  4090
done  4091
done  4092
done  4093
done  4094
done  4095
done  4096
done  4097
done  4098
done  4099
done  4100
done  4101
done  4102
done  4103
done  4104
done  4105
done  4106
done  4107
done  4108
done  4109
done  4110
done  4111

done  4825
done  4826
done  4827
done  4828
done  4829
done  4830
done  4831
done  4832
done  4833
done  4834
done  4835
done  4836
done  4837
done  4838
done  4839
done  4840
done  4841
done  4842
done  4843
done  4844
done  4845
done  4846
done  4847
done  4848
done  4849
done  4850
done  4851
done  4852
done  4853
done  4854
done  4855
done  4856
done  4857
done  4858
done  4859
done  4860
done  4861
done  4862
done  4863
done  4864
done  4865
done  4866
done  4867
done  4868
done  4869
done  4870
done  4871
done  4872
done  4873
done  4874
done  4875
done  4876
done  4877
done  4878
done  4879
done  4880
done  4881
done  4882
done  4883
done  4884
done  4885
done  4886
done  4887
done  4888
done  4889
done  4890
done  4891
done  4892
done  4893
done  4894
done  4895
done  4896
done  4897
done  4898
done  4899
done  4900
done  4901
done  4902
done  4903
done  4904
done  4905
done  4906
done  4907
done  4908
done  4909
done  4910
done  4911
done  4912
done  4913
done  4914
done  4915

# Exploratory Data Analysis

In [None]:
corpus = [d for d in corpus if d.meta['gender'] != 'nonbinary']
#Get all the birth years and plot a histogram
birth_year_y = [d.meta['birthyear'] for d in corpus]
x = pd.Series(birth_year_y, name="Birth Year")
sns.distplot(x)

#Get a count of of the birth years and plot a bar chart
df = pd.DataFrame.from_dict(Counter(birth_year_y), orient='index').reset_index()
df.columns = ['Year','Frequency of people born']
df = df.sort_values(by=['Year'])
df.plot.bar(x='Year', y='Frequency of people born', rot=90,figsize=(10,10), title='The number of people born in each year')
#BirthYearDF = copy.deepcopy(df)


#Get all the birth years and plot a histogram
birth_year_y = [d.meta['birthyearrange'] for d in corpus]
df1 = pd.DataFrame.from_dict(Counter(birth_year_y), orient='index').reset_index()
df1.columns = ['Year','Frequency of people born']
df1 = df1.sort_values(by=['Year'])
df1.plot.bar(x='Year', y='Frequency of people born', rot=90,figsize=(10,10), title='The number of people born in each year')


#Get all the genders and plot a bar chart
gender_y = [d.meta['gender'] for d in corpus]
df2 = pd.DataFrame.from_dict(Counter(gender_y), orient='index').reset_index()
df2.columns = ['Gender','Frequency of Gender']
df2.plot.bar(x='Gender', y='Frequency of Gender', rot=90,figsize=(10,10), title='The number of people born in each year')



# Get Train and Test Split + Resample

In [3]:
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler

'''
Undersample men 
'''
femaleCorpus = [d for d in corpus if d.meta['gender'] == 'female']
maleCorpus   = [d for d in corpus if d.meta['gender'] == 'male']
genderCorpus = maleCorpus[:len(femaleCorpus)] + femaleCorpus
##################THIS NEEDS TO BE DEALT WITh
#Undersample men    

#Getting gender Train and Test
gender_y = [d.meta['gender'] for d in genderCorpus]
gender_X = genderCorpus
Gender_X_train, Gender_X_test, Gender_y_train, Gender_y_test = train_test_split(gender_X,gender_y, test_size=0.3, random_state = 0)
genderCorpus = [d for d in corpus if d.meta['gender'] != 'nonbinary']

'''
Calculate the average year and undersample ages so all the ages that are overrepresented (above the average frequency)
then they are decreased to the average frequency
'''
BirthYearDF = df.reset_index(drop=True)
averageFrequency = round(BirthYearDF['Frequency of people born'].mean())
BirthYearsThatNeedUnderSampling = BirthYearDF.loc[BirthYearDF['Frequency of people born'] > averageFrequency]
BirthYearsThatNeedUnderSampling = BirthYearsThatNeedUnderSampling.set_index('Year') #.T.to_dict('list')
BirthYearsThatNeedUnderSampling = BirthYearsThatNeedUnderSampling.to_dict()
BirthYearsThatNeedUnderSampling = BirthYearsThatNeedUnderSampling.get('Frequency of people born')

itemsToDelete = []
for c in corpus:
    itemBirthYear = c.meta['birthyear']
    if itemBirthYear in BirthYearsThatNeedUnderSampling:
        FrequencyOfRow = BirthYearsThatNeedUnderSampling[itemBirthYear]
        if FrequencyOfRow > averageFrequency:
            itemsToDelete.append(c)
            BirthYearsThatNeedUnderSampling[itemBirthYear] -= 1
            print('Removed one ',itemBirthYear)
print('Done')

    
corpus = [celeb for celeb in corpus if celeb not in itemsToDelete]

#Getting Birth_year_range Train and Test
birth_year_y = [d.meta['birthyearrange'] for d in corpus]
birth_year_y = [d.meta['birthyear'] for d in corpus]
birth_year_X = corpus
Birth_X_train, Birth_X_test, Birth_y_train, Birth_y_test = train_test_split(birth_year_X,birth_year_y, test_size=0.3, random_state = 0)
genderCorpus = [d for d in corpus if d.meta['gender'] != 'nonbinary']




GenderCount = Counter(gender_y)
BirthYearCount = Counter(birth_year_y)
print(GenderCount)
print('---------------')
print(BirthYearCount)

averageFrequency

NameError: name 'df' is not defined

# Model Selection, GridSearch to identify best classifier and best params

In [14]:
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline #,FeatureUnion
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate, StratifiedKFold
#from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, AdaBoostClassifier, VotingClassifier, BaggingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error, r2_score
#from sklean.metrics import metrics
import math
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn import linear_model
from sklearn.svm import SVC
#from sklearn.feature_extraction import FeatureHasher
import pickle

'''
model = Pipeline([
    ('union', FeatureUnion(
        transformer_list = [
            ('word', Pipeline([
                ('processor', DocumentProcessor(process_method = get_pos_fql)),
                ('vectorizer', DictVectorizer()),
            ])),
        ],
    )),
    ('clf', None), # to be set by grid search.
])

param_grid={ 'clf': [LogisticRegression(solver='liblinear', random_state=0)
                     ,MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
                     RandomForestClassifier(),
                    MLPClassifier(max_iter=400)
                    ],
    
            'union__word__processor__process_method': [get_tokens_fql, get_fws_fql, get_pos_fql, get_text_stats],}


search = GridSearchCV(model, cv = StratifiedKFold(n_splits=5, random_state=0), 
                      return_train_score = False, 
                      scoring = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'],
                      refit = 'f1_weighted',
                      param_grid = param_grid
                     )



print('Fitting Clf')
search.fit(Gender_X_train, Gender_y_train)
print('Getting Predictions')
predictions = search.predict(Gender_X_test)

print("Accuracy: ", accuracy_score(Gender_y_test, predictions))
print(classification_report(Gender_y_test, predictions))
print(confusion_matrix(Gender_y_test, predictions))

confusion_matrix_heatmap(confusion_matrix(Gender_y_test,predictions), ['M','F'])
'''

'\nmodel = Pipeline([\n    (\'union\', FeatureUnion(\n        transformer_list = [\n            (\'word\', Pipeline([\n                (\'processor\', DocumentProcessor(process_method = get_pos_fql)),\n                (\'vectorizer\', DictVectorizer()),\n            ])),\n        ],\n    )),\n    (\'clf\', None), # to be set by grid search.\n])\n\nparam_grid={ \'clf\': [LogisticRegression(solver=\'liblinear\', random_state=0)\n                     ,MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),\n                     RandomForestClassifier(),\n                    MLPClassifier(max_iter=400)\n                    ],\n    \n            \'union__word__processor__process_method\': [get_tokens_fql, get_fws_fql, get_pos_fql, get_text_stats],}\n\n\nsearch = GridSearchCV(model, cv = StratifiedKFold(n_splits=5, random_state=0), \n                      return_train_score = False, \n                      scoring = [\'accuracy\', \'precision_weighted\', \'recall_weighted\', \'f1_weighte

# Predicting Birth Year

In [None]:
clf = DecisionTreeRegressor()
regr = linear_model.LinearRegression()
reg = linear_model.BayesianRidge()

no_estimators= 10

#Ada boost SVR with pos_fql
clf = SVR(gamma='scale', C=1.0, epsilon=0.2)
AdaBoostSVR = BaggingRegressor(clf, n_estimators=no_estimators, random_state=0)
Pipeline_AdaBoostSVR_get_pos_fql = Pipeline([
    ('processor', DocumentProcessor(process_method = get_fws_fql)),
    ('vectorizer', DictVectorizer()),
    ('clf', AdaBoostSVR),
])

#Ada boost Random Forest with pos_fql
clf = RandomForestRegressor(n_estimators=100)
AdaBoostRandomForest = BaggingRegressor(clf, n_estimators=no_estimators, random_state=0)
Pipeline_AdaBoostRandomForest_get_pos_fql = Pipeline([
    ('processor', DocumentProcessor(process_method = get_fws_fql)),
    ('vectorizer', DictVectorizer()),
    ('clf', AdaBoostRandomForest),
])

#Ada boost Bayesian Ridge with pos_fql
clf = linear_model.BayesianRidge()
AdaBoostBayesianRidge = BaggingRegressor(clf, n_estimators=no_estimators, random_state=0)
Pipeline_AdaBoostBayesianRidge_get_pos_fql = Pipeline([
    ('processor', DocumentProcessor(process_method = get_pos_fql)),
    ('vectorizer', DictVectorizer()),
    ('clf', AdaBoostRandomForest),
])

#Ada boost SVR with fws_fql
clf = SVR(gamma='scale', C=1.0, epsilon=0.2)
AdaBoostSVR = BaggingRegressor(clf, n_estimators=no_estimators, random_state=0)
Pipeline_AdaBoostSVR_get_fws_fql = Pipeline([
    ('processor', DocumentProcessor(process_method = get_pos_fql)),
    ('vectorizer', DictVectorizer()),
    ('clf', AdaBoostSVR),
])


'''
Creating Master Pipeline to fit each ensemble and average the result
'''
PipeLineList = [Pipeline_AdaBoostRandomForest_get_pos_fql, 
                Pipeline_AdaBoostSVR_get_pos_fql,
                Pipeline_AdaBoostSVR_get_fws_fql,
                Pipeline_AdaBoostBayesianRidge_get_pos_fql]

predictions= [0] * len(Birth_X_test)
x = 0
print('Fitting clfs')
for pipeline in PipeLineList:
    x+=1
    print('Fitting Clf number ',x)
    pipeline.fit(Birth_X_train, Birth_y_train)
    print('Getting Predictions ', x)
    predictions += pipeline.predict(Birth_X_test)

predictions = predictions / len(PipeLineList)

    
'''
print('Fitting Clf')
BirthYearPipeline.fit(Birth_X_train, Birth_y_train)
print('Saving Model')
filename = 'BirthYearRegression_AdaBoost50.sav'
pickle.dump(BirthYearPipeline, open(filename, 'wb'))
BirthYearPipeline = pickle.load(open(filename, 'rb'))
print('Getting Predictions')
predictions = BirthYearPipeline.predict(Birth_X_test)
predictions = [round(x) for x in predictions]'''

# Evaluating Birth Year Classifier

In [None]:
print('RMSE: ', math.sqrt(mean_squared_error(predictions,Birth_y_test)))
print('MSLE: ', mean_squared_log_error(predictions,Birth_y_test))
print('R2 Score: ', r2_score(predictions,Birth_y_test))
print('MAE: ', mean_absolute_error(predictions,Birth_y_test))


res = pd.DataFrame( data = {'Predictions': predictions, 'Actual': Birth_y_test} )
res[:100].plot( colormap='Paired')

print("Accuracy: ", accuracy_score(Birth_y_test, predictions))
print(classification_report(Birth_y_test, predictions))
print(confusion_matrix(Birth_y_test, predictions))


#labels = list(set(Birth_y_test+predictions))

confusion_matrix_heatmap(confusion_matrix(Birth_y_test,predictions)  )





# Predicting Gender

In [12]:


no_estimators= 1

#Ada boost SVM with pos_fql
#clf = SVC(gamma='auto', verbose=True, algorithm='SAMME')
#clf = SGDClassifier(loss='hinge', algorithm='SAMME')
clf = SVC(probability=True, kernel='linear', verbose=True)
AdaBoostSVM = clf #AdaBoostClassifier(clf, n_estimators=no_estimators, random_state=0)
Pipeline_AdaBoostSVM_get_pos_fql = Pipeline([
    ('processor', DocumentProcessor(process_method = get_tokens_fql)),
    ('vectorizer', DictVectorizer()),
    ('clf', AdaBoostSVM),
])

#Ada boost Random Forest with pos_fql
clf = MLPClassifier(random_state=0, verbose=1, max_iter=50)
AdaBoostMLP = clf #AdaBoostClassifier(clf, n_estimators=no_estimators, random_state=0)
Pipeline_AdaBoostMLP_get_pos_fql = Pipeline([
    ('processor', DocumentProcessor(process_method = get_tokens_fql)),
    ('vectorizer', DictVectorizer()),
    ('clf', AdaBoostMLP),
])

#Ada boost Random Forest with pos_fql
clf = MLPClassifier(random_state=0, verbose=1, max_iter=50)
AdaBoostMLP = clf #AdaBoostClassifier(clf, n_estimators=no_estimators, random_state=0)
Pipeline_AdaBoostMLP_get_FWS_fql = Pipeline([
    ('processor', DocumentProcessor(process_method = get_fws_fql)),
    ('vectorizer', DictVectorizer()),
    ('clf', AdaBoostMLP),
])


#Ada boost Random Forest with pos_fql
clf = LogisticRegression(verbose=1)
AdaBoostMLP = clf #AdaBoostClassifier(clf, n_estimators=no_estimators, random_state=0)
Pipeline_AdaBoostLR_get_FWS_fql = Pipeline([
    ('processor', DocumentProcessor(process_method = get_fws_fql)),
    ('vectorizer', DictVectorizer()),
    ('clf', AdaBoostMLP),
])


#Ada boost Bayesian Ridge with pos_fql
clf = LogisticRegression(verbose=1)
AdaBoostLogisticRegression = clf #AdaBoostClassifier(clf, n_estimators=no_estimators, random_state=0)
Pipeline_AdaBoostLR_get_pos_fql = Pipeline([
    ('processor', DocumentProcessor(process_method = get_tokens_fql)),
    ('vectorizer', DictVectorizer()),
    ('clf', AdaBoostLogisticRegression),
])

#Ada boost SVR with fws_fql
clf = RandomForestClassifier(n_estimators=100, verbose=1)
AdaBoostRandForest = clf #AdaBoostClassifier(clf, n_estimators=no_estimators, random_state=0)
Pipeline_AdaBoostRandForest_get_fws_fql = Pipeline([
    ('processor', DocumentProcessor(process_method = get_tokens_fql)),
    ('vectorizer', DictVectorizer()),
    ('clf', AdaBoostRandForest),
])

'''
ListOfClassifiers = [('MLP', Pipeline_AdaBoostMLP_get_pos_fql),
                     ('SVM', Pipeline_AdaBoostSVM_get_pos_fql ), 
                     ('LR', Pipeline_AdaBoostLR_get_pos_fql), 
                     ('randfor', Pipeline_AdaBoostRandForest_get_fws_fql)], 
                      voting='hard')]

x=0
for clf in ListOfClassifiers:
    x+=1
    print('Fitting Clf: ', x)
    clf.fit(Gender_X_train, Gender_y_train)
    

    print('Getting Predictions')
    predictions = clf.predict(Gender_X_test)

'''
    
GenderPipeline = VotingClassifier(
    estimators=[('MLP', Pipeline_AdaBoostMLP_get_pos_fql),
                ('MLP2', Pipeline_AdaBoostMLP_get_FWS_fql),
                ('LR', Pipeline_AdaBoostLR_get_pos_fql),
                ('LR1', Pipeline_AdaBoostLR_get_pos_fql), 
                ('LR2', Pipeline_AdaBoostLR_get_pos_fql),
                ('LR3', Pipeline_AdaBoostLR_get_FWS_fql),
                ('randfor', Pipeline_AdaBoostRandForest_get_fws_fql)
               ],
                voting='hard')



'''

print('Fitting Clf')
GenderPipeline.fit(Gender_X_train, Gender_y_train)
print('Getting Predictions')
predictions = GenderPipeline.predict(Gender_X_test)
print('Got Predictions')

path = '/home/jay/Documents/AppliedDataMining/FinalProject/Classifiers/GenderEnsemble'
with open(path, 'wb') as fp:
    pickle.dump(GenderPipeline, fp)
    
'''

NameError: name 'SVC' is not defined

# Evaluating Gender Classifier

In [None]:
print("Accuracy: ", accuracy_score(Gender_y_test, predictions))
print(classification_report(Gender_y_test, predictions))
print(confusion_matrix(Gender_y_test, predictions))

#confusion_matrix_heatmap(confusion_matrix(Gender_y_test, predictions), GenderPipeline.classes_)
confusion_matrix_heatmap(confusion_matrix(Gender_y_test, predictions), ['Male', 'Female'])


res = pd.DataFrame( data = {'Predictions': predictions, 'Actual': Gender_y_test} )

res = res.replace(['male', 'female'], [1, 0])

res[:20].plot( colormap='Paired')

math.sqrt(mean_squared_error(predictions,Gender_y_test))

# Stacked Generalisation Meta Classifier

In [None]:
'''
# The stacked generalisation meta classifier (SGMC) is a combination of the gender ensemble and the birth year ensemble

#First the outputs of the of the predicted birth year from each of the bagging regressors need to be saved

#Then the output of predicted gender needs to be saved

#Along with the true birth year

So now a dataset with the following structure is created, where X are the features 
(outputs of the ensembles) and Y is the true birth year.

   X                         Y
1995,1994,1997,1999,1       1995
...................................
..................................
..................................
..................................
............


Once this has been done, the MLP Regressor can then be trained 
using the stacked generalisation method of hold one out

StackedGeneralisationData = pd.DataFrame({"BirthYearPrediction1":[], 
                                        "BirthYearPrediction2":[],
                                        "BirthYearPrediction3":[],
                                        "BirthYearPrediction4":[],
                                        "GenderPrediction":[]
                                        "TrueValue":[]
                                       }) 
'''

GenderEnsemblePath = '/home/jay/Documents/AppliedDataMining/FinalProject/Classifiers/GenderEnsemble'
BirthYearEnsemblePath = '/home/jay/Documents/AppliedDataMining/FinalProject/Classifiers/BirthYearEnsemble'


PipeLineList      = pickle.load(open(BirthYearEnsemblePath, 'rb'))
GenderPipeline    = pickle.load(open(GenderEnsemblePath, 'rb'))

StackedGeneralisationTrainingData = '/home/jay/Documents/AppliedDataMining/FinalProject/Data/StackedGeneralisationTrainingData.csv'
  

#First the outputs of the of the predicted birth year from each of the bagging regressors need to be saved
x = 0
for observation in corpus:
        x+=1
        print('Getting Predictions ', x)
        BirthYearPrediction1 = PipeLineList[0].predict([observation])[0]
        BirthYearPrediction2 = PipeLineList[1].predict([observation])[0]
        BirthYearPrediction3 = PipeLineList[2].predict([observation])[0]
        BirthYearPrediction4 = PipeLineList[3].predict([observation])[0]
        GenderPrediction = GenderPipeline.predict([observation])[0]
        if GenderPrediction == 'male':
            GenderPrediction = 1
        else:
            GenderPrediction = 0
        
        true_value = observation.meta['birthyear']
        
        # Creating the first Dataframe using dictionary 
        StackedGeneralisation_Append = pd.DataFrame({"BirthYearPrediction1":[BirthYearPrediction1], 
                                                "BirthYearPrediction2":[BirthYearPrediction2],
                                                "BirthYearPrediction3":[BirthYearPrediction3],
                                                "BirthYearPrediction4":[BirthYearPrediction4],
                                                "GenderPrediction":[GenderPrediction],
                                                "TrueValue":[true_value]
                                               }) 

        with open(StackedGeneralisationTrainingData, 'a') as f:
            StackedGeneralisation_Append.to_csv(f, header=False)  
        print('done ', x )

        #StackedGeneralisationData.append(StackedGeneralisation_Append) 

        



In [None]:
GenderEnsemblePath = '/home/jay/Documents/AppliedDataMining/FinalProject/Classifiers/GenderEnsemble'
BirthYearEnsemblePath = '/home/jay/Documents/AppliedDataMining/FinalProject/Classifiers/BirthYearEnsemble'


PipeLineList      = pickle.load(open(BirthYearEnsemblePath, 'rb'))
GenderPipeline    = pickle.load(open(GenderEnsemblePath, 'rb'))


def SGMC_Predict(observation):
    BirthYearPrediction1 = PipeLineList[0].predict([observation])[0]
    BirthYearPrediction2 = PipeLineList[1].predict([observation])[0]
    BirthYearPrediction3 = PipeLineList[2].predict([observation])[0]
    BirthYearPrediction4 = PipeLineList[3].predict([observation])[0]
    GenderPrediction = GenderPipeline.predict([observation])[0]
    if GenderPrediction == 'male':
        GenderPrediction = 1
    else:
        GenderPrediction = 0    
    true_value = observation.meta['birthyear']
        
    # Creating the first Dataframe using dictionary 
    StackedGeneralisation_Append = pd.DataFrame({"BirthYearPrediction1":[BirthYearPrediction1], 
                                                "BirthYearPrediction2":[BirthYearPrediction2],
                                                "BirthYearPrediction3":[BirthYearPrediction3],
                                                "BirthYearPrediction4":[BirthYearPrediction4],
                                                "GenderPrediction":[GenderPrediction],
                                        
                                               }) 

    return round(SGMC.predict(StackedGeneralisation_Append))
    


In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.neural_network import MLPRegressor
StackedGeneralisationTrainingDataPath = '/home/jay/Documents/AppliedDataMining/FinalProject/Data/StackedGeneralisationTrainingData.csv'

StackedGeneralisationTrainingData = pd.read_csv(StackedGeneralisationTrainingDataPath)
StackedGeneralisationTrainingData = StackedGeneralisationTrainingData.reset_index(drop=True)

X = StackedGeneralisationTrainingData[['BirthYearPrediction1','BirthYearPrediction2','BirthYearPrediction3','BirthYearPrediction4','GenderPrediction']]
y = StackedGeneralisationTrainingData[['TrueValue']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 0)

X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

SGMC = RandomForestRegressor(n_estimators=300)
print('Training')
SGMC.fit(X_train,y_train)
print('Predicting')
predictions = SGMC.predict(X_test)
print('Done')

# Evaluating Stacked Generalisation Meta Classifier

In [None]:
import math 
print('RMSE: ', math.sqrt(mean_squared_error(predictions,y_test)))
print('MSLE: ', mean_squared_log_error(predictions,y_test))
print('R2 Score: ', r2_score(predictions,y_test))
print('MAE: ', mean_absolute_error(predictions,y_test))



res = pd.DataFrame( data = {'Predictions': predictions, 'Actual': y_test['TrueValue']} )
res[:100].plot( colormap='Paired')


In [None]:
#Testing
path = 'alishapatel28_tweets.json'
path = 'ashnapatel_tweets.json'
    
def getTestData():
    # Read in the twitter text
    data = []
    with open(path) as json_file:  
        data = json.load(json_file)

    return data

def getTestDocument(data):
    try:
        doc = Document({}) #include metadata
        for tweet in data:
            doc.extract_features_from_text(tweet['full_text'])
            #print(tweet['full_text'])
        return doc
    except:
        print("An exception occurred")
        
TestCorpus = getTestData()

TestCorpus = getTestDocument(TestCorpus)
PredictedBirthYear = 0
PredictedGender = ''

'''
for pipeline in PipeLineList:
    PredictedYear = pipeline.predict([TestCorpus])
    print('Predicted Year: ',PredictedYear)
    PredictedBirthYear += PredictedYear
PredictedBirthYear = PredictedBirthYear/len(PipeLineList)
'''

PredictedBirthYear = SGMC_Predict(TestCorpus)

PredictedGender = GenderPipeline.predict([TestCorpus])     
print('Predicted Birth-Year for: ', path, 'is: ', PredictedBirthYear)
print('Predicted Age for ', path, 'is: ', 2018 - PredictedBirthYear )
print('Predicted Gender for: ', path, 'is: ', PredictedGender)



In [None]:

'''
CODE TO UNDERSAMPLE BIRTH YEAR
'''
BirthYearCorpus = copy.deepcopy(corpus)
BirthYearDF = BirthYearDF.reset_index(drop=True)
averageFrequency = round(BirthYearDF['Frequency of people born'].mean())
BirthYearsThatNeedUnderSampling = BirthYearDF.loc[BirthYearDF['Frequency of people born'] > averageFrequency]
BirthYearsThatNeedUnderSampling = BirthYearsThatNeedUnderSampling.set_index('Year') #.T.to_dict('list')
BirthYearsThatNeedUnderSampling = BirthYearsThatNeedUnderSampling.to_dict()
BirthYearsThatNeedUnderSampling = BirthYearsThatNeedUnderSampling.get('Frequency of people born')

indexOfRowsToDelete = []
for i in len(BirthYearCorpus):
    c = BirthYearCorpus[i]
    itemBirthYear = c.meta['birthyear']
    if itemBirthYear in BirthYearsThatNeedUnderSampling:
        FrequencyOfRow = BirthYearsThatNeedUnderSampling[itemBirthYear]
        if FrequencyOfRow > averageFrequency:
            indexOfRowsToDelete.append(i)
            BirthYearsThatNeedUnderSampling[itemBirthYear] -= 1
            print('Removed one 'itemBirthYear)
indexOfRowsToDelete
print('Done')
#delete all the birth years in that index

In [None]:
from sklearn.externals import joblib

for i in range(len(corpus)):
    path = '/home/jay/Documents/AppliedDataMining/FinalProject/Data/20000Celebs/'
    path += 'Celeb'+str(i)
    with open(path, 'wb') as fp:
        pickle.dump(corpus[i], fp)

print('DONE')

In [None]:
from sklearn.model_selection import train_test_split
import random
femaleCorpus = [d for d in corpus if d.meta['gender'] == 'female']
maleCorpus   = [d for d in corpus if d.meta['gender'] == 'male']
genderCorpus = maleCorpus[:len(femaleCorpus)] + femaleCorpus
##################THIS NEEDS TO BE DEALT WITh
#Undersample men    

#genderCorpus = random.sample(genderCorpus, 10000)


#Getting gender Train and Test
gender_y = [d.meta['gender'] for d in genderCorpus]
gender_X = genderCorpus
Gender_X_train, Gender_X_test, Gender_y_train, Gender_y_test = train_test_split(gender_X,gender_y, test_size=0.3, random_state = 0)
gender_X.clear()
maleCorpus.clear()
femaleCorpus.clear()
print(Counter(gender_y))
gender_y.clear()
genderCorpus.clear()
corpus.clear()


In [None]:
def SGMC_Predict(observation):
    BirthYearPrediction1 = PipeLineList[0].predict([observation])[0]
    BirthYearPrediction2 = PipeLineList[1].predict([observation])[0]
    BirthYearPrediction3 = PipeLineList[2].predict([observation])[0]
    BirthYearPrediction4 = PipeLineList[3].predict([observation])[0]
    GenderPrediction = GenderPipeline.predict([observation])[0]
    if GenderPrediction == 'male':
        GenderPrediction = 1
    else:
        GenderPrediction = 0    
        
    # Creating the first Dataframe using dictionary 
    StackedGeneralisation_Append = pd.DataFrame({"BirthYearPrediction1":[BirthYearPrediction1], 
                                                "BirthYearPrediction2":[BirthYearPrediction2],
                                                "BirthYearPrediction3":[BirthYearPrediction3],
                                                "BirthYearPrediction4":[BirthYearPrediction4],
                                                "GenderPrediction":[GenderPrediction],
                                        
                                               }) 

    return SGMC.predict(StackedGeneralisation_Append)

In [None]:
import pandas as pd
path = '/home/jay/Documents/AppliedDataMining/FinalProject/Data/StackedGeneralisationTrainingData.csv'

df = pd.DataFrame({"BirthYearPrediction1":[1995], 
                                                "BirthYearPrediction2":[1991],
                                                "BirthYearPrediction3":[1993],
                                                "BirthYearPrediction4":[1994],
                                                "GenderPrediction":[1],
                                                "TrueValue":[1997]
                                               }) 

with open(path, 'a') as f:
    df.to_csv(f, header=False)

In [41]:
from sklearn.feature_selection import SelectKBest, chi2
clf = RandomForestClassifier(n_estimators=100, verbose=1)
AdaBoostRandForest = clf #AdaBoostClassifier(clf, n_estimators=no_estimators, random_state=0)
Pipeline_AdaBoostRandForest_get_fws_fql = Pipeline([
    ('processor', DocumentProcessor(process_method = get_tokens_fql)),
    ('vectorizer', DictVectorizer()),
    ('selector', SelectKBest(chi2, k=10) ),
    ('clf', AdaBoostRandForest),
    ])
Pipeline_AdaBoostRandForest_get_fws_fql.fit(Gender_X_train,Gender_y_train)
print('Done')

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Done


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.6s finished


In [81]:
DP = DocumentProcessor(process_method = get_tokens_fql)
tt = DP.transform(Gender_X_train)
selector = SelectKBest(chi2, k=100)
feats = vectorizer.fit_transform(tt)
filtered = selector.fit_transform(feats, Gender_y_train)

In [83]:
print(filtered)

  (0, 99)	121.0
  (0, 98)	13.0
  (0, 97)	3.0
  (0, 96)	1.0
  (0, 95)	4.0
  (0, 94)	143.0
  (0, 93)	301.0
  (0, 33)	551.0
  (0, 32)	49.0
  (0, 31)	27.0
  (0, 30)	10.0
  (0, 29)	799.0
  (0, 28)	31.0
  (0, 27)	9.0
  (0, 26)	199.0
  (0, 25)	27.0
  (0, 24)	273.0
  (0, 23)	65.0
  (0, 22)	123.0
  (0, 21)	88.0
  (0, 20)	26.0
  (0, 19)	68.0
  (0, 18)	12.0
  (0, 17)	4.0
  (0, 16)	15.0
  :	:
  (2220, 24)	305.0
  (2220, 23)	40.0
  (2220, 22)	238.0
  (2220, 21)	129.0
  (2220, 20)	9.0
  (2220, 19)	44.0
  (2220, 18)	27.0
  (2220, 17)	12.0
  (2220, 16)	35.0
  (2220, 15)	16.0
  (2220, 14)	7.0
  (2220, 13)	3.0
  (2220, 12)	71.0
  (2220, 11)	2104.0
  (2220, 10)	3061.0
  (2220, 9)	1024.0
  (2220, 8)	51.0
  (2220, 7)	43.0
  (2220, 6)	30.0
  (2220, 5)	486.0
  (2220, 4)	397.0
  (2220, 3)	10.0
  (2220, 2)	46.0
  (2220, 1)	101.0
  (2220, 0)	635.0
