In [75]:
#  /Data/celebs-usa/female contains 381 texts by females
#  /Data/celebs-usa/male contains 912 texts by males
#  /Data/celebs-other-json contains text by

# Identify birth year as that is a constant, these tweets are from 2011-2018, age range constantly changes but birth year stays constant
# using birth year, predict age 10-15, 15-20, 20-25, 25-30, 30-35, 35-40, 45-55,55+  
#{'25-34', '35-44', '45-54', '55-64', '65+'}

from os import listdir, makedirs
from os.path import isfile, join, splitext, split
import json
from collections import Counter
import ftfy
import re
import nltk

def import_celebs_json(folder):
    jsonfiles = [join(folder, f) for f in listdir(folder) if isfile(join(folder, f)) and f.endswith(".json")]
    for jf in jsonfiles:
        with open(jf) as f:
            data = json.load(f)
            handle = data['handle']
            gender = data['gender']
            age_range = data['age_range']
            english = data['english']
            year_of_birth = data['year_of_birth']
            if ( (data['gender'] != 'unknown') and (data['year_of_birth'] != 'unknown')):
                print("Processing " + handle)
                doc = Document({'handle': handle, 'gender': gender, year_of_birth:'year_of_birth', 'age_range': age_range, 'english': english}) #include metadata
                for tweet in data['tweets']:
                    doc.extract_features_from_text(tweet['text'])
                yield doc
            
        
hashtag_re = re.compile(r"#\w+")
mention_re = re.compile(r"@\w+")
url_re = re.compile(r"(?:https?://)?(?:[-\w]+\.)+[a-zA-Z]{2,9}[-\w/#~:;.?+=&%@~]*")

def preprocess(text):
    p_text = hashtag_re.sub("[hashtag]",text)
    p_text = mention_re.sub("[mention]",p_text)
    p_text = url_re.sub("[url]",p_text)
    p_text = ftfy.fix_text(p_text)
    return p_text.lower()

tokenise_re = re.compile(r"(\[[^\]]+\]|[-'\w]+|[^\s\w\[']+)") #([]|words|other non-space)
def tokenise(text):
    return tokenise_re.findall(text)

        
class Document:
    def __init__(self, meta={}):
        self.meta = meta
        self.tokens_fql = Counter() #empty Counter, ready to be added to with Counter.update.
        self.pos_fql = Counter()
        self.pos_list = [] #empty list for pos tags from running text.
        self.num_tokens = 0
        
    def extract_features_from_text(self, text):
        p_text = preprocess(text)
        tokens = tokenise(p_text)
        self.num_tokens += len(tokens)
        self.tokens_fql.update(tokens) #updating Counter counts items in list, adding to existing Counter items.
        pos_tagged = nltk.pos_tag(tokens)
        pos = [tag[1] for tag in pos_tagged]
        self.pos_fql.update(pos)
        self.pos_list.extend(pos)
        
    def extract_features_from_texts(self, texts): #texts should be iterable text lines, e.g. read in from file.
        for text in texts:
            extract_features_from_text(text)
            
    def average_token_length(self):
        sum_lengths = 0
        for key, value in self.tokens_fql.items():
            sum_lengths += len(key) * value
        return sum_lengths / self.num_tokens

In [76]:
'''
# Reading Male and Female Data First
from os import listdir
from os.path import isfile, join, splitext, split

def list_files(folder):
    textfiles = [join(folder, f) for f in listdir(folder) if isfile(join(folder, f)) and f.endswith(".txt")]
    return textfiles
'''

'\n# Reading Male and Female Data First\nfrom os import listdir\nfrom os.path import isfile, join, splitext, split\n\ndef list_files(folder):\n    textfiles = [join(folder, f) for f in listdir(folder) if isfile(join(folder, f)) and f.endswith(".txt")]\n    return textfiles\n'

In [77]:
# Try and open existing pickle file, if not exists then create new pickle file
import pickle
import os

corpus = []
'''
try:
    foo = pickle.load(open("/Data/CelebFile", "rb"))
except as e:
    corpus.extend(import_celebs_json("/home/jay/Documents/AppliedDataMining/FinalProject/Data/celebs-json"))
    with open('/Data/CelebFile', 'wb') as fp:
        pickle.dump(corpus, fp)

'''

if os.path.exists("/Data/CelebFile"):
    with open('CombinedCelebFile', 'rb') as fp:
        corpus = pickle.load(fp)
else:
    corpus.extend(import_celebs_json("/home/jay/Documents/AppliedDataMining/FinalProject/Data/celebs-json"))
    with open('/Data/CelebFile', 'wb') as fp:
        pickle.dump(corpus, fp)
    


Processing WilliamShatner
Processing TheRealSimonCho
Processing ariannahuff
Processing LeonelGOficial
Processing VictoriaCoren
Processing devonkershaw
Processing MrKRudd
Processing Marsha_Thomason
Processing BrentMCM
Processing stephenharper
Processing zaza27
Processing ashabhosle
Processing khoi
Processing anandmahindra
Processing Pchiddy
Processing Schwarzenegger
Processing virsanghvi
Processing Jonnyboy77
Processing duttypaul
Processing gallinari8888
Processing mattdusk
Processing jessicalowndes
Processing SenRehmanMalik
Processing LisaLavie
Processing sanjayjee
Processing DaveNighbor
Processing howiemandel
Processing JulianM
Processing janibrajkovic
Processing cricketaakash
Processing christianmeier
Processing ajaydevgn
Processing russellcrowe
Processing robbiewilliams
Processing ameesha_patel
Processing NelsonPiquet
Processing lilyallen
Processing Cristiano
Processing GordonRamsay
Processing rubarrichello
Processing akshaykumar
Processing CharlizeAfrica
Processing CelinaJaitly
Pro

FileNotFoundError: [Errno 2] No such file or directory: '/Data/CelebFile'

In [37]:
gender_y = [d.meta['gender'] for d in corpus]
gender_X = corpus


for age in set([d.meta['birth_year'] for d in gender_X]):
    yy = [d for d in X if d.meta['age_range'] == age]
    print('age: ' , age , ' : ' , len(yy) )

