# **IMPORT LIBRARIES**

In [6]:
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from tensorflow.keras.applications import VGG16
from tensorflow.keras.layers import LSTM, Dense, Dropout, Flatten, GlobalAveragePooling2D, Input, Concatenate
from tensorflow.keras.models import Model
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split


# **MENTAL HEALTH TWITTER DATASET LOAD**

In [7]:
# Read the Mental-Health-Twitter CSV from the Colab /content directory into `data`.
data = pd.read_csv('/content/Mental-Health-Twitter.csv')
# Bare last expression: render the (truncated) frame as the cell output.
data

Unnamed: 0.1,Unnamed: 0,post_id,post_created,post_text,user_id,followers,friends,favourites,statuses,retweets,label
0,0,637894677824413696,Sun Aug 30 07:48:37 +0000 2015,It's just over 2 years since I was diagnosed w...,1013187241,84,211,251,837,0,1
1,1,637890384576778240,Sun Aug 30 07:31:33 +0000 2015,"It's Sunday, I need a break, so I'm planning t...",1013187241,84,211,251,837,1,1
2,2,637749345908051968,Sat Aug 29 22:11:07 +0000 2015,Awake but tired. I need to sleep but my brain ...,1013187241,84,211,251,837,0,1
3,3,637696421077123073,Sat Aug 29 18:40:49 +0000 2015,RT @SewHQ: #Retro bears make perfect gifts and...,1013187241,84,211,251,837,2,1
4,4,637696327485366272,Sat Aug 29 18:40:26 +0000 2015,It’s hard to say whether packing lists are mak...,1013187241,84,211,251,837,1,1
...,...,...,...,...,...,...,...,...,...,...,...
19995,19995,819336825231773698,Thu Jan 12 00:14:56 +0000 2017,A day without sunshine is like night.,1169875706,442,230,7,1063601,0,0
19996,19996,819334654260080640,Thu Jan 12 00:06:18 +0000 2017,"Boren's Laws: (1) When in charge, ponder. (2) ...",1169875706,442,230,7,1063601,0,0
19997,19997,819334503042871297,Thu Jan 12 00:05:42 +0000 2017,The flow chart is a most thoroughly oversold p...,1169875706,442,230,7,1063601,0,0
19998,19998,819334419374899200,Thu Jan 12 00:05:22 +0000 2017,"Ships are safe in harbor, but they were never ...",1169875706,442,230,7,1063601,0,0


# **MENTAL HEALTH DATASET PREPROCESSING**

In [8]:
# Drop the 'Unnamed: 0' column (it appears to be a row counter saved into the CSV).
# `errors='ignore'` keeps this cell idempotent: re-running it after the column
# has already been removed no longer raises KeyError.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')
data

Unnamed: 0,post_id,post_created,post_text,user_id,followers,friends,favourites,statuses,retweets,label
0,637894677824413696,Sun Aug 30 07:48:37 +0000 2015,It's just over 2 years since I was diagnosed w...,1013187241,84,211,251,837,0,1
1,637890384576778240,Sun Aug 30 07:31:33 +0000 2015,"It's Sunday, I need a break, so I'm planning t...",1013187241,84,211,251,837,1,1
2,637749345908051968,Sat Aug 29 22:11:07 +0000 2015,Awake but tired. I need to sleep but my brain ...,1013187241,84,211,251,837,0,1
3,637696421077123073,Sat Aug 29 18:40:49 +0000 2015,RT @SewHQ: #Retro bears make perfect gifts and...,1013187241,84,211,251,837,2,1
4,637696327485366272,Sat Aug 29 18:40:26 +0000 2015,It’s hard to say whether packing lists are mak...,1013187241,84,211,251,837,1,1
...,...,...,...,...,...,...,...,...,...,...
19995,819336825231773698,Thu Jan 12 00:14:56 +0000 2017,A day without sunshine is like night.,1169875706,442,230,7,1063601,0,0
19996,819334654260080640,Thu Jan 12 00:06:18 +0000 2017,"Boren's Laws: (1) When in charge, ponder. (2) ...",1169875706,442,230,7,1063601,0,0
19997,819334503042871297,Thu Jan 12 00:05:42 +0000 2017,The flow chart is a most thoroughly oversold p...,1169875706,442,230,7,1063601,0,0
19998,819334419374899200,Thu Jan 12 00:05:22 +0000 2017,"Ships are safe in harbor, but they were never ...",1169875706,442,230,7,1063601,0,0


In [9]:
# Print column names, non-null counts and dtypes for the tweet frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   post_id       20000 non-null  int64 
 1   post_created  20000 non-null  object
 2   post_text     20000 non-null  object
 3   user_id       20000 non-null  int64 
 4   followers     20000 non-null  int64 
 5   friends       20000 non-null  int64 
 6   favourites    20000 non-null  int64 
 7   statuses      20000 non-null  int64 
 8   retweets      20000 non-null  int64 
 9   label         20000 non-null  int64 
dtypes: int64(8), object(2)
memory usage: 1.5+ MB


In [10]:
# Count missing values per column.
data.isnull().sum()

Unnamed: 0,0
post_id,0
post_created,0
post_text,0
user_id,0
followers,0
friends,0
favourites,0
statuses,0
retweets,0
label,0


In [11]:
# Count fully duplicated rows (every column equal to an earlier row).
data.duplicated().sum()

117

In [12]:
# Same duplicate-row count as the previous cell; nothing has changed in between.
data.duplicated().sum()

117

In [13]:
# Remove fully duplicated rows, keeping the first occurrence; the original
# index labels of the surviving rows are retained.
data = data.drop_duplicates()

In [14]:
# Sanity check: the duplicate count should now be 0.
data.duplicated().sum()

0

# FER2013 DATASET
# *Dataset Loading*

In [17]:
# Read the FER2013 CSV (columns: emotion, pixels, Usage) from /content into `data_3`.
data_3 = pd.read_csv('/content/fer2013.csv')
# Bare last expression: render the (truncated) frame as the cell output.
data_3

Unnamed: 0,emotion,pixels,Usage
0,0,70 80 82 72 58 58 60 63 54 58 60 48 89 115 121...,Training
1,0,151 150 147 155 148 133 111 140 170 174 182 15...,Training
2,2,231 212 156 164 174 138 161 173 182 200 106 38...,Training
3,4,24 32 36 30 32 23 19 20 30 41 21 22 32 34 21 1...,Training
4,6,4 0 0 0 0 0 0 0 0 0 0 0 3 15 23 28 48 50 58 84...,Training
...,...,...,...
4491,0,205 193 215 222 221 204 194 214 224 228 221 22...,Training
4492,3,245 189 102 200 236 219 212 211 218 243 220 19...,Training
4493,0,6 6 6 6 6 11 64 88 71 118 98 95 142 152 156 16...,Training
4494,3,181 151 51 11 3 2 1 4 2 10 10 11 10 6 9 9 10 1...,Training


# **FER2013 Preprocessing**

In [18]:
# Print column names, non-null counts and dtypes for the FER2013 frame.
data_3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4496 entries, 0 to 4495
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   emotion  4496 non-null   int64 
 1   pixels   4496 non-null   object
 2   Usage    4495 non-null   object
dtypes: int64(1), object(2)
memory usage: 105.5+ KB


In [19]:
# Count missing values per column.
data_3.isnull().sum()

Unnamed: 0,0
emotion,0
pixels,0
Usage,1


In [20]:
# Count fully duplicated rows (every column equal to an earlier row).
data_3.duplicated().sum()

38

In [21]:
# Remove fully duplicated rows, keeping the first occurrence; the original
# index labels of the surviving rows are retained.
data_3 = data_3.drop_duplicates()

In [22]:
# Sanity check: the duplicate count should now be 0.
data_3.duplicated().sum()

0

# **CITY TEMPERATURE DATASET**


In [26]:
# Read the city temperature CSV from /content into `city_temp`.
# NOTE(review): on_bad_lines='skip' silently drops rows that fail to parse,
# so the loaded row count may be lower than the file's — confirm that is acceptable.
city_temp = pd.read_csv('/content/city_temperature.csv', on_bad_lines='skip') # Skip lines with errors
# Bare last expression: render the (truncated) frame as the cell output.
city_temp

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
0,Africa,Algeria,,Algiers,1.0,1.0,1995.0,64.2
1,Africa,Algeria,,Algiers,1.0,2.0,1995.0,49.4
2,Africa,Algeria,,Algiers,1.0,3.0,1995.0,48.8
3,Africa,Algeria,,Algiers,1.0,4.0,1995.0,46.4
4,Africa,Algeria,,Algiers,1.0,5.0,1995.0,47.9
...,...,...,...,...,...,...,...,...
809261,Europe,Switzerland,,Zurich,6.0,24.0,2009.0,57.2
809262,Europe,Switzerland,,Zurich,6.0,25.0,2009.0,63.1
809263,Europe,Switzerland,,Zurich,6.0,26.0,2009.0,64.8
809264,Europe,Switzerland,,Zurich,6.0,27.0,2009.0,61.6


In [27]:
# Print column names, non-null counts and dtypes for the temperature frame.
city_temp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 809266 entries, 0 to 809265
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Region          809266 non-null  object 
 1   Country         809266 non-null  object 
 2   State           0 non-null       float64
 3   City            809265 non-null  object 
 4   Month           809264 non-null  float64
 5   Day             809264 non-null  float64
 6   Year            809264 non-null  float64
 7   AvgTemperature  809264 non-null  float64
dtypes: float64(5), object(3)
memory usage: 49.4+ MB


In [28]:
# Count missing values per column.
city_temp.isnull().sum()

Unnamed: 0,0
Region,0
Country,0
State,809266
City,1
Month,2
Day,2
Year,2
AvgTemperature,2


In [29]:
# Count fully duplicated rows (every column equal to an earlier row).
city_temp.duplicated().sum()

2184

In [30]:
# Remove fully duplicated rows, keeping the first occurrence; the original
# index labels of the surviving rows are retained.
city_temp = city_temp.drop_duplicates()

In [31]:
# Sanity check: the duplicate count should now be 0.
city_temp.duplicated().sum()

0

Combining the three datasets column-wise (rows are aligned on the row index)

In [32]:
# Place the three frames side by side. Column-wise concat aligns rows on the
# index label only, so a row of `combined_data` pairs whichever tweet, city
# reading and face image happen to share that label; labels missing from a
# frame are filled with NaN.
# NOTE(review): the three sources share no join key — confirm that pairing
# rows purely by index position is the intended way to combine them.
combined_data = pd.concat(
    objs=[data, city_temp, data_3],
    axis='columns',
)
combined_data


Unnamed: 0,post_id,post_created,post_text,user_id,followers,friends,favourites,statuses,retweets,label,...,Country,State,City,Month,Day,Year,AvgTemperature,emotion,pixels,Usage
0,6.378947e+17,Sun Aug 30 07:48:37 +0000 2015,It's just over 2 years since I was diagnosed w...,1.013187e+09,84.0,211.0,251.0,837.0,0.0,1.0,...,Algeria,,Algiers,1.0,1.0,1995.0,64.2,0.0,70 80 82 72 58 58 60 63 54 58 60 48 89 115 121...,Training
1,6.378904e+17,Sun Aug 30 07:31:33 +0000 2015,"It's Sunday, I need a break, so I'm planning t...",1.013187e+09,84.0,211.0,251.0,837.0,1.0,1.0,...,Algeria,,Algiers,1.0,2.0,1995.0,49.4,0.0,151 150 147 155 148 133 111 140 170 174 182 15...,Training
2,6.377493e+17,Sat Aug 29 22:11:07 +0000 2015,Awake but tired. I need to sleep but my brain ...,1.013187e+09,84.0,211.0,251.0,837.0,0.0,1.0,...,Algeria,,Algiers,1.0,3.0,1995.0,48.8,2.0,231 212 156 164 174 138 161 173 182 200 106 38...,Training
3,6.376964e+17,Sat Aug 29 18:40:49 +0000 2015,RT @SewHQ: #Retro bears make perfect gifts and...,1.013187e+09,84.0,211.0,251.0,837.0,2.0,1.0,...,Algeria,,Algiers,1.0,4.0,1995.0,46.4,4.0,24 32 36 30 32 23 19 20 30 41 21 22 32 34 21 1...,Training
4,6.376963e+17,Sat Aug 29 18:40:26 +0000 2015,It’s hard to say whether packing lists are mak...,1.013187e+09,84.0,211.0,251.0,837.0,1.0,1.0,...,Algeria,,Algiers,1.0,5.0,1995.0,47.9,6.0,4 0 0 0 0 0 0 0 0 0 0 0 3 15 23 28 48 50 58 84...,Training
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
809261,,,,,,,,,,,...,Switzerland,,Zurich,6.0,24.0,2009.0,57.2,,,
809262,,,,,,,,,,,...,Switzerland,,Zurich,6.0,25.0,2009.0,63.1,,,
809263,,,,,,,,,,,...,Switzerland,,Zurich,6.0,26.0,2009.0,64.8,,,
809264,,,,,,,,,,,...,Switzerland,,Zurich,6.0,27.0,2009.0,61.6,,,


# Textual Feature: post_text

In [43]:
# Load pre-trained BERT model and tokenizer
# Both are created from the same checkpoint name so the tokenizer's vocabulary
# matches the model; they are module-level names used by get_bert_embeddings.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = TFBertModel.from_pretrained(model_name)

# Function to get BERT embeddings for a text
def get_bert_embeddings(text):
  """Embed `text` with the module-level `tokenizer` and `bert_model`.

  The text is tokenized (padded and truncated, returned as TensorFlow
  tensors), run through the model, and the last hidden state is sliced at
  sequence position 0 — for BERT this is presumably the [CLS] token.

  Args:
    text: Input passed straight to `tokenizer`.

  Returns:
    The `[:, 0, :]` slice of `last_hidden_state`, i.e. one vector per
    input sequence with a leading batch axis.
  """
  encoded = tokenizer(text, truncation=True, padding=True, return_tensors='tf')
  last_hidden = bert_model(encoded).last_hidden_state
  # Keep only the first position along the sequence axis.
  first_token_vectors = last_hidden[:, 0, :]
  return first_token_vectors

# Apply BERT to 'post_text' column
# The model is invoked once per row; rows whose 'post_text' is not a string
# (e.g. NaN) get None instead of an embedding.
combined_data['bert_embeddings'] = combined_data['post_text'].apply(lambda x: get_bert_embeddings(x) if isinstance(x, str) else None)


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

# Temporal Feature: post_created

In [44]:
# Convert 'post_created' to timezone-aware datetimes.
# The column holds Twitter-style stamps such as 'Sun Aug 30 07:48:37 +0000 2015'.
# Passing the format explicitly avoids pandas' format guessing / per-element
# fallback parsing and makes a malformed stamp fail loudly; utc=True yields a
# single UTC-aware dtype so the `.dt` accessor below always works.
# Missing values (NaN) become NaT, and re-running the cell on an already
# converted column is harmless.
combined_data['post_created'] = pd.to_datetime(
    combined_data['post_created'],
    format='%a %b %d %H:%M:%S %z %Y',
    utc=True,
)

# Extract temporal features: hour of day, day of week (Monday=0), month (1-12).
# Rows with NaT produce NaN in all three columns.
combined_data['hour'] = combined_data['post_created'].dt.hour
combined_data['dayofweek'] = combined_data['post_created'].dt.dayofweek
combined_data['month'] = combined_data['post_created'].dt.month

# LSTM layer, presumably for the temporal features; it is only instantiated
# here and not yet applied to any input.
lstm_layer = LSTM(units=64)


# Image Feature: pixels

In [45]:
# Load pre-trained VGG16 model (without top layers)
# weights='imagenet' downloads the pretrained weights on first use;
# include_top=False leaves out the classifier layers at the top of the network.
vgg_model = VGG16(weights='imagenet', include_top=False)

# Function to extract VGG16 features from an image
def get_vgg16_features(pixels):
  """Extract flattened VGG16 features from one space-separated pixel string.

  Args:
    pixels: Whitespace-separated grayscale intensities for a single 48x48
      image (2304 values). Any non-string value (e.g. NaN) means "no image".

  Returns:
    A tensor of shape (1, n_features) with the flattened output of the
    module-level `vgg_model`, or None when `pixels` is not a string.

  Raises:
    ValueError: if the string does not contain exactly 48 * 48 values.
  """
  if not isinstance(pixels, str):
    return None

  # Parse the pixel string and validate its length up front so a malformed
  # row reports a clear message instead of an opaque reshape error.
  values = np.array(pixels.split(), dtype='float32')
  if values.size != 48 * 48:
    raise ValueError(
        'expected %d pixel values for a 48x48 image, got %d'
        % (48 * 48, values.size)
    )

  # Grayscale -> 3 identical channels, scaled to [0, 1].
  # NOTE(review): Keras documents `vgg16.preprocess_input` as the matching
  # preprocessing for the ImageNet weights; the /255 scaling is kept here to
  # preserve existing feature values — confirm which one is intended.
  img_array = np.repeat(values.reshape((48, 48, 1)), 3, axis=2) / 255.0

  # Resize image to match VGG16 input size and add a batch axis of size 1.
  img_batch = tf.expand_dims(tf.image.resize(img_array, [224, 224]), axis=0)

  # Call the model directly instead of `predict`: for a single small batch
  # this avoids predict's per-call setup and its progress-bar output, which
  # matters because this function runs once per row.
  features = vgg_model(img_batch, training=False)

  # Flatten to (1, n_features) without building a new Keras layer per call.
  return tf.reshape(features, (1, -1))

# Apply VGG16 to 'pixels' column
# The model runs once per row; rows whose 'pixels' value is not a string
# (e.g. NaN) get None.
combined_data['vgg16_features'] = combined_data['pixels'].apply(get_vgg16_features)


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m58889256/58889256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
