<a href="https://colab.research.google.com/github/KhizarAziz/Test_Solution/blob/main/Innovative_Sol_Test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
# from pathlib import Path
import random
import matplotlib.pyplot as plt
import os
import cv2

# **Setup Datasets**

In [None]:
#Vision
# Fetch the vision dataset archive from Google Drive by file id, then extract it
# quietly into the current working directory.
# NOTE(review): the download is presumably saved as dataset.zip and unpacks to
# ./dataset/ (the paths used later point at /content/dataset/) - confirm.
!gdown --id 1Gn8A2bfGK80JlYz9IU6GEWQP1NT8Jjgc
!unzip -q dataset.zip # unzip zip file

In [None]:
# NLP
# Fetch the two NLP data files from Google Drive by file id.
# NOTE(review): presumably these are twitter_train.csv and twitter_test.csv, which
# the NLP section reads from /content/ - confirm which id maps to which file.
!gdown --id 19YsuFeoRQI3CwEV5VvCBWhR5C9y_3xWW
!gdown --id 1v-2WODjtFI6QL1XIiGqKr4u82466lRGu

# **Vision**



> ## **Imports**



In [None]:
from keras.layers import Input,Conv2D,BatchNormalization,ReLU,AveragePooling2D,GlobalAveragePooling2D,Dense,Dropout,multiply
from keras.models import Model
# from keras import regularizers
from keras.optimizers import Adam
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.applications.mobilenet import MobileNet
# from keras.callbacks import ReduceLROnPlateau



> ## **Load & Preprocess Data**



In [None]:
#paths
# Root folders of the extracted vision dataset; images are looked up recursively
# below these, and the name of the folder holding an image is used as its label.
train_dir = '/content/dataset/training_set/'
test_dir = '/content/dataset/test_set'

In [None]:
#params
input_shape = (224,224,3)
dropout = 0.2
batch_size = 32
# One category per sub-directory of train_dir. The list is sorted because
# os.listdir returns entries in arbitrary order and the position of a name in
# this list is used as its class id; non-directory entries are skipped so stray
# files are not counted as classes.
all_categories = sorted(
    dirname for dirname in os.listdir(train_dir)
    if os.path.isdir(os.path.join(train_dir, dirname))
)
out_categories = len(all_categories)

In [None]:
#functions
def get_dataset(base_dir):
  """Collect every ``.jpg`` file under ``base_dir`` as shuffled [path, label] pairs.

  The label of a file is the last '/'-separated component of the directory
  that contains it. The returned list is shuffled in place before returning.
  """
  samples = []
  for folder, _subdirs, names in os.walk(base_dir):
    label = folder.split('/')[-1]
    samples.extend(
        [os.path.join(folder, name), label]
        for name in names
        if name.endswith(".jpg")
    )
  random.shuffle(samples) # randomise the sample order
  return samples

def data_generator(onlyfiles,img_shape,batch_size):
  """Yield (images, one-hot labels) batches forever, cycling over ``onlyfiles``.

  Every sample is visited once per pass: the final batch of a pass may be
  smaller than ``batch_size`` instead of being dropped.

  Args:
    onlyfiles: list of [image_path, category_name] pairs.
    img_shape: target image shape; only its first two entries are used.
    batch_size: maximum number of samples per batch (must be positive).

  Yields:
    Tuple ``(img_np, labels_np)``: the stacked images scaled by 1/255 and the
    matching one-hot labels (one row per image).

  Raises:
    ValueError: if ``onlyfiles`` is empty, ``batch_size`` is not positive, an
      image cannot be read, or a category is missing from ``all_categories``.
  """
  df_count = len(onlyfiles)
  # An empty list would otherwise make the endless loop below spin without ever yielding.
  if df_count == 0:
    raise ValueError('data_generator needs at least one file')
  if batch_size <= 0:
    raise ValueError(f'batch_size must be positive, got {batch_size}')

  target_size = tuple(img_shape[0:2])
  num_categories = len(all_categories)
  while True:
    # range() with a step covers the full list, including a short final batch
    for start in range(0, df_count, batch_size):
      current_batch = onlyfiles[start:start+batch_size]
      img_List = []
      train_labels = []
      for item in current_batch: # load & transform each image of the batch
        img = cv2.imread(item[0])
        if img is None: # cv2.imread signals an unreadable file by returning None
          raise ValueError(f'Could not read image: {item[0]}')
        ss = np.min(img.shape[0:2])
        img = img[0:ss,0:ss] # crop the top-left square with the shorter side's length
        img = cv2.resize(img,target_size)
        img = img/255 # normalize
        img_List.append(img)

        # labels encoding: position in all_categories is the class id
        label_id = all_categories.index(item[1])
        train_labels.append(to_categorical(label_id,num_categories))

      yield np.array(img_List), np.array(train_labels) # return batch

def get_testset(onlyfiles,img_shape):
  """Load every listed image into memory and return (images, one-hot labels).

  Images are resized to the first two entries of ``img_shape`` and scaled by
  1/255. Unlike data_generator, no square crop is applied before resizing.
  NOTE(review): that difference between train and test preprocessing looks
  unintentional - confirm which of the two is wanted.

  Args:
    onlyfiles: list of [image_path, category_name] pairs.
    img_shape: target image shape; only its first two entries are used.

  Returns:
    Tuple ``(img_np, labels_np)`` of numpy arrays with one row per file.

  Raises:
    ValueError: if an image cannot be read or a category is missing from
      ``all_categories``.
  """
  target_size = tuple(img_shape[0:2])
  num_categories = len(all_categories)
  imgs = []
  labels = []
  for item in onlyfiles:
    img = cv2.imread(item[0])
    if img is None: # cv2.imread signals an unreadable file by returning None
      raise ValueError(f'Could not read image: {item[0]}')
    img = cv2.resize(img,target_size)
    img = img/255 # normalize
    imgs.append(img)

    # labels encoding: position in all_categories is the class id
    label_id = all_categories.index(item[1])
    labels.append(to_categorical(label_id,num_categories))
  img_np = np.array(imgs)
  labels_np = np.array(labels)
  return img_np,labels_np

In [None]:
# train & val split
# 80/20 split of the training files, with a fixed seed so the split is repeatable
dataset = get_dataset(train_dir)
trainset, valset = train_test_split(
    dataset,
    test_size=0.2,
    train_size=0.8,
    random_state=5,
)
# endless batch generators over each split
train_gen = data_generator(onlyfiles=trainset, img_shape=input_shape, batch_size=batch_size)
val_gen = data_generator(onlyfiles=valset, img_shape=input_shape, batch_size=batch_size)

# testset: loaded fully into memory as arrays
testset = get_dataset(test_dir)
test_imgs, test_labels = get_testset(onlyfiles=testset, img_shape=input_shape)



> ## **Training & Evaluation**



In [None]:
# MODEL
# MobileNet backbone with ImageNet weights and no classification head
base_model = MobileNet(
    weights='imagenet',
    include_top=False,
    input_shape=input_shape,
)
# New head: global average pool -> 128-unit ReLU layer -> softmax over the categories
pooled = GlobalAveragePooling2D()(base_model.output)
hidden = Dense(128, activation='relu')(pooled)
m_out = Dense(out_categories, activation='softmax')(hidden)
model = Model(inputs=[base_model.input], outputs=[m_out])

In [None]:
#COMPILE
lr = 0.001
# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
adam = Adam(learning_rate=lr)
model.compile(
    optimizer=adam,
    # The head is a softmax over one-hot encoded categories, so the matching
    # loss is categorical cross-entropy rather than the binary variant.
    loss = 'categorical_crossentropy',
    metrics=['accuracy']
)

In [None]:
epochs = 15
# Step counts must be whole numbers: round up so one epoch covers every sample
# of the split (the generators loop forever, so rounding up never exhausts them).
steps_per_epoch = max(1, int(np.ceil(len(trainset) / batch_size)))
# One pass over the validation split per epoch; repeating the same data several
# times only costs time without changing the averaged metric.
validation_steps = max(1, int(np.ceil(len(valset) / batch_size)))
history = model.fit(
    train_gen,
    steps_per_epoch=steps_per_epoch,
    epochs=epochs,
    validation_data=val_gen,
    validation_steps=validation_steps,
)

In [None]:
# Training vs. validation accuracy per epoch
fig, ax = plt.subplots()
for curve in ('accuracy', 'val_accuracy'):
  ax.plot(history.history[curve])
ax.grid(axis='both')
ax.legend(['accuracy', 'val_accuracy'], loc='upper left')
plt.show()



> ## **Inference**



In [None]:
# evaluation on the held-out test set; returns the loss followed by the compiled metrics
p = model.evaluate(x=test_imgs, y=test_labels)

In [None]:
# p comes from model.evaluate: p[0] is reported as the loss, p[1] as the accuracy (shown as a percentage)
print(f'Loss: {p[0]} -  Accuracy: {round(p[1]*100,3)}%')

# **NLP**



> ## **Imports**



In [None]:
import nltk
import string
import re
import keras.backend as K
from sklearn.model_selection import train_test_split
from keras.layers import Embedding,Dense,Dropout,LSTM
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import pandas as pd
pd.set_option('display.max_colwidth',100)

> ## **Load & Preprocess Data**

In [None]:
# Both CSV files are decoded as ISO-8859-1; only the label and text columns of
# the training file are kept.
train_df = pd.read_csv(
    '/content/twitter_train.csv', encoding="ISO-8859-1"
)[['Sentiment','OriginalTweet']]
test_df = pd.read_csv('/content/twitter_test.csv', encoding="ISO-8859-1")

In [None]:
#CREATING LABELS
def create_labels(data_df, categories=None):
  """One-hot encode the ``Sentiment`` column of ``data_df``.

  Class ids are positions in ``categories``. When ``categories`` is omitted the
  sorted unique sentiments of ``data_df`` are used, so two dataframes holding
  the same set of sentiments get the same encoding regardless of row order.

  Args:
    data_df: dataframe with a ``Sentiment`` column.
    categories: optional sequence of sentiment values fixing the class order;
      pass the same sequence for every dataframe to share one encoding.

  Returns:
    List with one 1-D float32 one-hot vector per row of ``data_df``.

  Raises:
    ValueError: if a sentiment is not present in ``categories``.
  """
  sentiments = data_df['Sentiment']
  if categories is None:
    categories = sorted(sentiments.unique())
  index_of = {category: idx for idx, category in enumerate(categories)}

  unknown = [s for s in sentiments.unique() if s not in index_of]
  if unknown:
    raise ValueError(f'Sentiment values missing from categories: {unknown}')

  # Row i of the identity matrix is the one-hot vector for class id i
  one_hot = np.eye(len(index_of), dtype='float32')
  return [one_hot[index_of[s]] for s in sentiments]

# Stack the per-row one-hot vectors into 2-D arrays (rows = tweets)
labels = np.asarray(create_labels(train_df))
test_labels = np.asarray(create_labels(test_df))

In [None]:
#VALIDATION SPLIT
# Hold out 20% of the training tweets; random_state pins the shuffle so a rerun
# reproduces the same split.
x_train,x_val,y_train,y_val = train_test_split(
    train_df['OriginalTweet'],labels,test_size=0.2,random_state=5)
# The tokenizing, padding and fitting cells below refer to this held-out split
# as x_test / y_test; bind those names so the notebook runs on a fresh kernel.
x_test,y_test = x_val,y_val

In [None]:
# Build the vocabulary from the training tweets only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=x_train)

In [None]:
# Use the fitted tokenizer to turn each tweet into a sequence of word ids
x_train_seq = tokenizer.texts_to_sequences(texts=x_train)
x_test_seq = tokenizer.texts_to_sequences(texts=x_test)

In [None]:
# Pad every tweet's id sequence to a common length of 60 tokens
max_tweet_len = 60
x_train_seq_padded = pad_sequences(x_train_seq, maxlen=max_tweet_len)
x_test_seq_padded = pad_sequences(x_test_seq, maxlen=max_tweet_len)

> ## **Training & Evaluation**

In [None]:
# Number of sentiment classes, taken from the width of the one-hot labels
num_classes = y_train.shape[1]

model = Sequential()
model.add(Embedding(len(tokenizer.index_word)+1,32)) # learn a 32-dim vector per word id (+1 for the padding id 0)
model.add(LSTM(32,dropout=0,recurrent_dropout=0)) # type of rnn
model.add(Dense(32,activation='relu'))
# Each tweet has exactly one sentiment and the loss is categorical cross-entropy,
# so the output must be a softmax distribution rather than independent sigmoids.
model.add(Dense(num_classes,activation='softmax'))
# model.summary()

In [None]:
def recall_m(y_true, y_pred):
    """Batch recall: rounded true positives divided by rounded actual positives.

    Epsilon is added to the denominator to avoid division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

def precision_m(y_true, y_pred):
    """Batch precision: rounded true positives divided by rounded predicted positives.

    Epsilon is added to the denominator to avoid division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score built from precision_m and recall_m (epsilon guards against 0/0)."""
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    denominator = p + r + K.epsilon()
    return 2 * ((p * r) / denominator)

In [None]:
# Track accuracy together with the custom batch-wise precision and F1 metrics
tracked_metrics = ['accuracy', precision_m, f1_m]
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=tracked_metrics,
)

In [None]:
# Train for 10 epochs, scoring the padded held-out sequences after each epoch
history = model.fit(
    x=x_train_seq_padded,
    y=y_train,
    validation_data=(x_test_seq_padded, y_test),
    epochs=10,
    batch_size=32,
)

In [None]:
# Training vs. validation precision per epoch
fig, ax = plt.subplots()
for curve in ('precision_m', 'val_precision_m'):
    ax.plot(history.history[curve])
ax.grid(axis='both')
ax.legend(['precision_m', 'val_precision_m'], loc='upper left')
plt.show()