# AGE PREDICTION

In [1]:
import time
code_start = time.time()

# Importing Libraries

In [2]:
import numpy as np
import pandas as pd
import re

In [3]:
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold

In [4]:
import torch
from torch.optim import Adam
from torch import nn
from torch.utils.data import Dataset, DataLoader

In [5]:
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup

In [6]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [7]:
torch.cuda.empty_cache()

In [8]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Processing Data

In [9]:
df_initial=pd.read_csv('labeled_tweet_table_Age.csv', encoding='utf8')

In [10]:
df_initial.head()

Unnamed: 0,Tweet,Screen Name,img_path,Under 21
0,@AdvoBarryRoux @GetVidBot,_____zac_____,0,0
1,"The owner of drip doesn't even have 100 mill, ...",_____zac_____,0,0
2,even Lekau the owner of Drip was saying that i...,_____zac_____,0,0
3,"@casspernyovest is cappin that ""R100m"" figure...",_____zac_____,0,0
4,I want a recipe from @JBscotchSA for #JBLemona...,_____zac_____,0,0


In [11]:
df_initial.shape

(106314, 4)

In [12]:
regexMap={r"<[\w'/'\s]*>": "",r"[\'\"\-]+": "",r"@[\w]+":"",r"#[\w]+":"",\
          r"https?:\/\/[\w+@:%._\+~#=]{1,256}\.[\w+()]{1,6}\b([\w+()@:%_\+.~#?&\/\/=]*)":"",\
          r"https?:\/\/[\w+@:%._\+~#=]{1,256}\.[\w+()]{1,6}\b([\w+()@:%_\+.~#?&\/\/=]*)\b(\;\w+\=\w+)":"",\
         r"[\w+@:%._\+~#=]{1,256}\.[\w+()]{1,6}\b([\w+()@:%_\+.~#?&\/\/=]*)":""}
def preprocess(datainput):
    t=datainput
    for regx in regexMap.keys():
        t = re.sub(regx, regexMap[regx], t)
    return t

In [13]:
df_initial["Tweet"]=df_initial["Tweet"].apply(preprocess)

In [14]:
df_initial.head()

Unnamed: 0,Tweet,Screen Name,img_path,Under 21
0,,_____zac_____,0,0
1,"The owner of drip doesnt even have 100 mill, d...",_____zac_____,0,0
2,even Lekau the owner of Drip was saying that i...,_____zac_____,0,0
3,"is cappin that R100m figure is so inflated, ...",_____zac_____,0,0
4,I want a recipe from for ! If youre looking f...,_____zac_____,0,0


In [15]:
df_initial.shape

(106314, 4)

In [16]:
df = df_initial[["Tweet","Screen Name","Under 21"]]

In [17]:
df.head()

Unnamed: 0,Tweet,Screen Name,Under 21
0,,_____zac_____,0
1,"The owner of drip doesnt even have 100 mill, d...",_____zac_____,0
2,even Lekau the owner of Drip was saying that i...,_____zac_____,0
3,"is cappin that R100m figure is so inflated, ...",_____zac_____,0
4,I want a recipe from for ! If youre looking f...,_____zac_____,0


In [18]:
df.shape

(106314, 3)

In [19]:
df['Under 21'].value_counts()

0    67044
1    39270
Name: Under 21, dtype: int64

In [20]:
screen_names_list = df['Screen Name'].unique()

print(screen_names_list[0:5])
print(len(screen_names_list))

['_____zac_____' '___aleia' '___schaeffer___' '__drewc' '__EmilyRice__']
1145


In [21]:
def final_classification_report(cr_list, cm_list):
    n = len(cr_list)
    acc, prec_0, rec_0, f1_0, prec_1, rec_1, f1_1 = 0, 0, 0, 0, 0, 0, 0
    cm = np.zeros((2,2))
    
    for i,cr in enumerate(cr_list):
        acc += cr['accuracy']
        prec_0 += cr['0']['precision']
        rec_0 += cr['0']['recall']
        f1_0 += cr['0']['f1-score']
        prec_1 += cr['1']['precision']
        rec_1 += cr['1']['recall']
        f1_1 += cr['1']['f1-score']
        
        cm += cm_list[i]
    
    print("Overall Accuracy-",round(acc/n,3),"\n")
    print("------(Age >= 21)------\n")
    print("Precision-",round(prec_0/n,3))
    print("Recall-",round(rec_0/n,3))
    print("F1-",round(f1_0/n,3))
    print("\n------(Age < 21)------\n")
    print("Precision-",round(prec_1/n,3))
    print("Recall-",round(rec_1/n,3))
    print("F1-",round(f1_1/n,3))
    print("\nConfusion Matrix-\n",cm)
    

# Naive Bayes Classifier

## Dataset [Concatenate strings for all users]

In [22]:
def df_NB(screen_names,df):
    tweets_dict = dict()

    for i,screen_name in enumerate(screen_names):
    
        tweets_list = df[df['Screen Name']==screen_name]["Tweet"].tolist()
        tweets_dict[i] = [screen_name,' '.join(tweets_list),df[df['Screen Name']==screen_name]["Under 21"].unique()[0]]
    tweets_NB = pd.DataFrame.from_dict(tweets_dict , orient='index')
    tweets_NB = tweets_NB.rename(columns={0: 'Screen Name', 1: 'Tweets', 2: 'Under 21'})
    return tweets_NB

## Vectorizing Words

In [23]:
def Vectorizing(train_tweets_NB, test_tweets_NB,stop_words_vectorizer):
    stop_words_vectorizer.fit(train_tweets_NB["Tweets"].values)
    
    x_input=stop_words_vectorizer.transform(train_tweets_NB["Tweets"].values)
    x_test_input=stop_words_vectorizer.transform(test_tweets_NB["Tweets"].values)
    
    return x_input, x_test_input

## Model and training

In [24]:
def model_NB(train_tweets_NB, test_tweets_NB,nb,stop_words_vectorizer):
    
    x_input, x_test_input = Vectorizing(train_tweets_NB, test_tweets_NB,stop_words_vectorizer)
    
    nb.fit(x_input,train_tweets_NB["Under 21"])
    
    y_pred_train = nb.predict(x_input)
    print("Train accurary-",round(metrics.accuracy_score(train_tweets_NB["Under 21"].values, y_pred_train),3))
    
    y_pred_test = nb.predict(x_test_input)
    print("Test accurary-",round(metrics.accuracy_score(test_tweets_NB["Under 21"].values, y_pred_test),3))
    
    c_report = classification_report(y_true=test_tweets_NB["Under 21"].values,y_pred=y_pred_test,output_dict=True)
    cm = confusion_matrix(test_tweets_NB["Under 21"].values,y_pred_test)
    
    #print("Classification Report\n",classification_report(y_true=test_tweets_NB["Under 21"].values,y_pred=y_pred_test))
    
    return c_report, cm

In [25]:
kf_NB = KFold(n_splits=5, random_state=24)
c_report_list = []
cm_list = []
k=1

stop_words_vectorizer=CountVectorizer(stop_words='english')

for train_tweets_sn, test_tweets_sn in kf_NB.split(screen_names_list):
    
    print(f'K-fold {k}/{5}')
    print('-' * 20)
    
    train_tweets_sn = screen_names_list[train_tweets_sn]
    test_tweets_sn = screen_names_list[test_tweets_sn]
    
    train_tweets_NB = df_NB(train_tweets_sn,df)
    test_tweets_NB = df_NB (test_tweets_sn,df)
    
    nb = MultinomialNB()
    
    a,b = model_NB(train_tweets_NB, test_tweets_NB,nb,stop_words_vectorizer)
    
    c_report_list.append(a)
    cm_list.append(b)
    
    k+=1

K-fold 1/5
--------------------
Train accurary- 0.989
Test accurary- 0.629
K-fold 2/5
--------------------
Train accurary- 0.987
Test accurary- 0.659
K-fold 3/5
--------------------
Train accurary- 0.99
Test accurary- 0.712
K-fold 4/5
--------------------
Train accurary- 0.989
Test accurary- 0.642
K-fold 5/5
--------------------
Train accurary- 0.998
Test accurary- 0.672


In [26]:
final_classification_report(c_report_list,cm_list)

Overall Accuracy- 0.663 

------(Age >= 21)------

Precision- 0.668
Recall- 0.924
F1- 0.774

------(Age < 21)------

Precision- 0.638
Recall- 0.226
F1- 0.327

Confusion Matrix-
 [[663.  55.]
 [331.  96.]]


# BERT Model

## Dataset [Split dataset by users]

In [27]:
def train_df(train_tweets_sn, df):
    train_tweets_df = df[df["Screen Name"]==train_tweets_sn[0]]
    for x in train_tweets_sn[1:]:
        train_tweets_df = train_tweets_df.append(df[df["Screen Name"]==x])
    return train_tweets_df

In [28]:
def test_df(test_tweets_sn,df):
    test_tweets_df = df[df["Screen Name"]==test_tweets_sn[0]]
    for x in test_tweets_sn[1:]:
        test_tweets_df = test_tweets_df.append(df[df["Screen Name"]==x])
    return test_tweets_df

## Dataloader

In [29]:
class Tweet_Dataset(Dataset):
    def __init__(self,dataset,tokenizer,max_len):
        
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_len = max_len    
        
    def __len__(self):
        return len(self.dataset["Tweet"])
    
    def __getitem__(self, index):
        
        tweet = str(self.dataset.iloc[index,0])
        label = self.dataset.iloc[index,2]
        
        encoding_input = self.tokenizer.encode_plus(tweet,max_length=self.max_len, add_special_tokens=True,\
                                               return_token_type_ids=False,pad_to_max_length=True, return_attention_mask=True,\
                                               return_tensors='pt',truncation=True)
        
        
        return {'tweet':tweet,'label':label,'input_ids':encoding_input['input_ids'].flatten(),\
                'attention_mask':encoding_input['attention_mask'].flatten()} 

In [30]:
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
token_lens = []
for txt in df["Tweet"]:
    tokens = tokenizer.encode(txt)
    token_lens.append(len(tokens))
print(max(token_lens))

150


## Creating a model

In [31]:
class Classifier(torch.nn.Module):
    def __init__(self):
        
        super(Classifier, self).__init__()
        
        self.bert_model=BertModel.from_pretrained("bert-base-cased")
        
        self.dropout = nn.Dropout(p=0.3)
        
        self.linear = nn.Linear(self.bert_model.config.hidden_size,2) 
        
    def forward(self,input_ids, attention_mask):
        
        last_hidden_layer,pooled_output = self.bert_model(input_ids=input_ids,attention_mask=attention_mask, return_dict=False)
        
        dropout_output = self.dropout(pooled_output)
        
        linear_output = self.linear(dropout_output)
        
        return linear_output

## Training and Testing

In [32]:
def train_loop(dataloader, model, loss_fn, optimizer,device, scheduler):
    
    size = len(dataloader.dataset)
    model=model.train()
    losses=0 
    accuracy=0 
    
    for d in dataloader:
        
        
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        targets = d['label'].to(device) 
        
        optimizer.zero_grad()
        
        outputs = model(input_ids=input_ids,attention_mask=attention_mask)

        _, preds = torch.max(outputs, dim=1)
        
        loss = loss_fn(outputs, targets)

        # Backpropagation
        
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        
        
        correct = (preds == targets).float()
        acc=torch.sum(correct)
        accuracy+=acc.item()  
        
        scheduler.step()
        
        losses+=loss.item()   
        
    return accuracy/size, losses/size

In [33]:
def test_loop(dataloader, model, device):
    
    model=model.eval()
    
    predictions = []
    
    with torch.no_grad():
        for d in dataloader:
            
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            targets = d['label'].to(device)
        
            outputs = model(input_ids=input_ids,attention_mask=attention_mask)
            
            _, preds = torch.max(outputs, dim=1)
            
            predictions = predictions + preds.tolist()
    
    values, counts = np.unique(predictions, return_counts=True)
    ind = np.argmax(counts)
    final_pred = values[ind]
 
    return final_pred

In [34]:
def age_prediction(train_twitter_loader, test_tweets_df, test_tweets_sn, model, loss, optimizer, device, scheduler, epochs, tokenizer, max_len, batch_size):
    
    best_test_acc = 0
    c_report_best = None
    c_matrix_best = None
    
    for t in range(epochs):
        print(f'Epoch {t + 1}/{epochs}')
        print('-' * 10)

        start=time.time()
                    
        train_acc, train_loss = train_loop(train_twitter_loader, model, loss, optimizer, device, scheduler)

        correct_pred = 0
        
        predictions=[]
        target_values=[]
        
        for y in test_tweets_sn:
            
            test_dataset = Tweet_Dataset(test_tweets_df[test_tweets_df["Screen Name"]==y],tokenizer,max_len)
            test_twitter_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
            
            test_pred = test_loop(test_twitter_loader, model, device)
            
            test_label = test_tweets_df[test_tweets_df["Screen Name"]==y]["Under 21"].unique()
            
            if(test_pred==test_label[0]):
                    correct_pred+=1
            
            predictions.append(test_pred)
            target_values.append(test_label[0])
        
        end=time.time()
        print("time taken-",round((end-start)/60.0,2),"minutes")

        print("Train Loss {} | Train Accuracy: {}%".format(round(train_loss, 3), round(train_acc*100, 3)))
        
        test_acc = correct_pred/len(test_tweets_sn)
        print("Test Accuracy: {}%".format(round(test_acc*100, 3)))
        
        c_report = classification_report(y_true=target_values,y_pred=predictions,output_dict=True)
        c_matrix = confusion_matrix(target_values,predictions)
        
        #print("Classification Report\n",classification_report(y_true=target_values,y_pred=predictions))
        
        if test_acc > best_test_acc:
            best_test_acc = test_acc
            # Save the parameters of the model
            c_report_best = c_report
            c_matrix_best = c_matrix
            
    return c_report_best, c_matrix_best

In [35]:
def main(train_tweets_df, test_tweets_df, test_tweets_sn):
    
    learning_rate = 3.1e-5  
    epochs = 2 
    
    MAX_LEN = 160 #180  
    BATCH_SIZE = 64 

    model = Classifier()
    model = model.to(device)
    
    loss=nn.CrossEntropyLoss().to(device)
    optimizer = Adam(model.parameters(), lr=learning_rate)
    
    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
    
    train_dataset = Tweet_Dataset(train_tweets_df,tokenizer,MAX_LEN)
    train_twitter_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    
    total_steps = len(train_twitter_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps=0,num_training_steps=total_steps)
    
    return age_prediction(train_twitter_loader, test_tweets_df, test_tweets_sn, model, loss, optimizer, device, scheduler, epochs, tokenizer, MAX_LEN, BATCH_SIZE)

In [36]:
kf = KFold(n_splits=5, shuffle = True, random_state=24)
c_report_list = []
cm_list = []
k=1

for train_tweets_sn, test_tweets_sn in kf.split(screen_names_list):
    print(f'K-fold {k}/{5}')
    print('-' * 20)
    
    train_tweets_sn = screen_names_list[train_tweets_sn]
    test_tweets_sn = screen_names_list[test_tweets_sn]

    train_tweets_df = train_df(train_tweets_sn, df)
    test_tweets_df = test_df(test_tweets_sn, df)
    c_report, c_matrix = main(train_tweets_df, test_tweets_df, test_tweets_sn)        
    
    c_report_list.append(c_report)
    cm_list.append(c_matrix)
    
    k=k+1

K-fold 1/5
--------------------
Epoch 1/2
----------
time taken- 12.96 minutes
Train Loss 0.01 | Train Accuracy: 64.04%
Test Accuracy: 64.192%
Epoch 2/2
----------
time taken- 12.95 minutes
Train Loss 0.009 | Train Accuracy: 67.838%
Test Accuracy: 70.306%
K-fold 2/5
--------------------
Epoch 1/2
----------
time taken- 12.93 minutes
Train Loss 0.01 | Train Accuracy: 63.711%
Test Accuracy: 68.122%
Epoch 2/2
----------
time taken- 12.92 minutes
Train Loss 0.009 | Train Accuracy: 66.966%
Test Accuracy: 69.869%
K-fold 3/5
--------------------
Epoch 1/2
----------
time taken- 13.0 minutes
Train Loss 0.01 | Train Accuracy: 64.204%
Test Accuracy: 68.122%
Epoch 2/2
----------
time taken- 13.02 minutes
Train Loss 0.009 | Train Accuracy: 68.353%
Test Accuracy: 67.686%
K-fold 4/5
--------------------
Epoch 1/2
----------
time taken- 13.01 minutes
Train Loss 0.01 | Train Accuracy: 64.626%
Test Accuracy: 62.009%
Epoch 2/2
----------
time taken- 13.02 minutes
Train Loss 0.009 | Train Accuracy: 67.95

# Result

In [37]:
final_classification_report(c_report_list, cm_list)

Overall Accuracy- 0.677 

------(Age >= 21)------

Precision- 0.683
Recall- 0.907
F1- 0.779

------(Age < 21)------

Precision- 0.654
Recall- 0.292
F1- 0.401

Confusion Matrix-
 [[651.  67.]
 [303. 124.]]


## Prediction

In [38]:
print("time taken for notebook-",round((time.time()-code_start)/60.0,2))

time taken for notebook- 132.27
