<a href="https://colab.research.google.com/github/KhizarAziz/Test_Solution/blob/main/Innovative_Sol_Test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
# from pathlib import Path
import random
import matplotlib.pyplot as plt
import os
import cv2

# **Setup Datasets**

In [None]:
#Vision
# Fetch the image dataset archive from Google Drive (by file id) and extract it
# quietly into the current working directory.
!gdown --id 1Gn8A2bfGK80JlYz9IU6GEWQP1NT8Jjgc
!unzip -q dataset.zip # unzip zip file

In [None]:
# NLP
# Fetch two files from Google Drive by file id
# (presumably the twitter_train.csv / twitter_test.csv read further down — TODO confirm).
!gdown --id 19YsuFeoRQI3CwEV5VvCBWhR5C9y_3xWW
!gdown --id 1v-2WODjtFI6QL1XIiGqKr4u82466lRGu

# **Vision**



> ## **Imports**



In [None]:
from keras.layers import Input,Conv2D,BatchNormalization,ReLU,AveragePooling2D,GlobalAveragePooling2D,Dense,Dropout,multiply
from keras.models import Model
# from keras import regularizers
from keras.optimizers import Adam
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.applications.mobilenet import MobileNet
# from keras.callbacks import ReduceLROnPlateau



> ## **Load & Preprocess Data**



In [None]:
#paths
# Root folders that are walked recursively by get_dataset(); the name of the folder
# that directly contains an image is used as its class label.
train_dir = '/content/dataset/training_set/'
test_dir = '/content/dataset/test_set'

In [None]:
#params
input_shape = (224,224,3)  # shape handed to MobileNet; the first two entries are the resize target
dropout = 0.2  # NOTE(review): not referenced by any other cell in this notebook
batch_size = 32
# Class names are the sub-folder names of train_dir. os.listdir() returns entries in
# arbitrary order, so sort them to keep the label ids identical across runs and
# machines, and skip stray files (only folders can be classes).
all_categories = sorted(
    dirname for dirname in os.listdir(train_dir)
    if os.path.isdir(os.path.join(train_dir, dirname))
)
out_categories = len(all_categories)

In [None]:
#functions
def get_dataset(base_dir, seed=None):
  """Collect every ``.jpg`` file below ``base_dir`` together with its class label.

  The label of a file is the name of the folder that directly contains it.

  Args:
    base_dir: root folder; it is walked recursively.
    seed: optional seed for the shuffle. ``None`` (default) shuffles with the
      global ``random`` state, exactly like before.

  Returns:
    A shuffled list of ``[file_path, label]`` pairs.
  """
  onlyfiles = []
  for dirpath, dirnames, filenames in os.walk(base_dir):
    # normpath strips a trailing separator so the label is never an empty string,
    # and basename honours the platform separator (split('/') did neither).
    label = os.path.basename(os.path.normpath(dirpath))
    for filename in filenames:
      if filename.endswith(".jpg"):
        onlyfiles.append([os.path.join(dirpath, filename), label])
  if seed is None:
    random.shuffle(onlyfiles) # generalize better
  else:
    onlyfiles.sort() # os.walk order depends on the file system; fix it before a seeded shuffle
    random.Random(seed).shuffle(onlyfiles)
  return onlyfiles

def data_generator(onlyfiles,img_shape,batch_size,categories=None):
  """Endlessly yield ``(images, one_hot_labels)`` batches.

  Every image is read with OpenCV, cropped to its top-left square, resized to
  ``img_shape[0:2]`` and scaled by 1/255.

  Args:
    onlyfiles: list of ``[file_path, label]`` pairs.
    img_shape: target shape; only the first two entries are used.
    batch_size: number of samples per batch.
    categories: ordered class names used to map a label to its id. Defaults to the
      notebook-level ``all_categories``.

  Yields:
    ``(img_np, labels_np)``: the stacked images and the stacked one-hot label
    vectors of one batch. The last batch of a pass may be smaller than
    ``batch_size``.

  Raises:
    ValueError: on the first ``next()`` if ``onlyfiles`` is empty or ``batch_size``
      is not positive, and whenever an image cannot be read.
  """
  if batch_size <= 0:
    raise ValueError("batch_size must be a positive integer")
  df_count = len(onlyfiles)
  if df_count == 0:
    raise ValueError("onlyfiles is empty - nothing to build batches from")
  # Snapshot the class list once: later cells rebind the global all_categories,
  # which must not change the label ids of a generator that is already running.
  class_names = list(all_categories if categories is None else categories)
  num_classes = len(class_names)
  while True:
    # range() also visits the final (possibly shorter) slice; the previous
    # `start+batch_size < df_count` test skipped it and never yielded at all
    # when there were fewer samples than batch_size.
    for start in range(0, df_count, batch_size):
      current_batch = onlyfiles[start:start+batch_size]
      #load imgs, normalize & create a list
      img_List = []
      train_labels = [] # one-hot label vectors of this batch
      for item in current_batch: #iterate over batch to load & transform each img
        img = cv2.imread(item[0])
        if img is None: # cv2.imread signals an unreadable file by returning None
          raise ValueError(f"could not read image: {item[0]}")
        ss = np.min(img.shape[0:2])
        img = img[0:ss,0:ss] # crop_square (top-left)
        img = cv2.resize(img,img_shape[0:2])
        img = img/255 # normalize
        img_List.append(img)

        # labels encoding
        label_id = class_names.index(item[1])
        train_labels.append(to_categorical(label_id,num_classes))

      yield np.array(img_List), np.array(train_labels) # return batch

def get_testset(onlyfiles,img_shape):
  """Load all listed images into memory and return them with one-hot labels.

  Unlike data_generator, images are not cropped to a square before resizing.

  Args:
    onlyfiles: list of ``[file_path, label]`` pairs.
    img_shape: target shape; only the first two entries are used for resizing.

  Returns:
    ``(img_np, labels_np)``: array of resized images scaled by 1/255 and array of
    one-hot label vectors (ids taken from the notebook-level ``all_categories``).
  """
  target_size = img_shape[0:2]
  num_classes = len(all_categories)

  def _load(path):
    # read, resize, then normalize
    return cv2.resize(cv2.imread(path), target_size) / 255

  def _encode(label):
    # labels encoding
    return to_categorical(all_categories.index(label), num_classes)

  img_np = np.array([_load(entry[0]) for entry in onlyfiles])
  labels_np = np.array([_encode(entry[1]) for entry in onlyfiles])
  return img_np,labels_np

In [None]:
# train & val split
# 80/20 split of the [path, label] list. random_state fixes the split indices, but the
# list itself is shuffled without a seed inside get_dataset.
dataset = get_dataset(train_dir)
trainset, valset = train_test_split(dataset, train_size=0.8, test_size=0.2, random_state=5)
# endless batch generators that are handed to model.fit
train_gen = data_generator(trainset,input_shape ,batch_size)
val_gen = data_generator(valset,input_shape ,batch_size)

# testset
# the whole test set is loaded into memory at once as two numpy arrays
testset = get_dataset(test_dir)
test_imgs,test_labels = get_testset(testset,input_shape)



> ## **Training**



In [None]:
# MODEL
# ImageNet-pretrained MobileNet without its classifier, followed by global average
# pooling, one hidden dense layer and a softmax over the image classes.
backbone = MobileNet(include_top=False,weights='imagenet',input_shape=input_shape)
pooled = GlobalAveragePooling2D()(backbone.output)
hidden = Dense(128,activation='relu')(pooled)
m_out = Dense(out_categories,activation='softmax')(hidden)
model = Model(inputs=[backbone.input],outputs=[m_out])

In [None]:
#COMPILE
lr = 0.001
# `learning_rate` is the supported keyword; `lr` is only a deprecated alias.
adam = Adam(learning_rate=lr)
model.compile(
    optimizer=adam,
    # The head is a softmax over one-hot targets, i.e. single-label multi-class, which
    # pairs with categorical cross-entropy. With binary_crossentropy the 'accuracy'
    # metric is computed per output unit and overstates accuracy for more than 2 classes.
    loss = 'categorical_crossentropy',
    metrics=['accuracy']
)

In [None]:
epochs = 15
# Step counts must be whole numbers; `/` produced floats. One epoch is one pass over
# the training files, and validation is one pass over the validation files (the
# former `* 3` evaluated the same validation images three times per epoch).
steps_per_epoch = max(1, len(trainset) // batch_size)
validation_steps = max(1, len(valset) // batch_size)
history = model.fit(train_gen,steps_per_epoch=steps_per_epoch, epochs=epochs,validation_data=val_gen,  validation_steps=validation_steps)

In [None]:
# Training vs. validation accuracy per epoch.
fig, ax = plt.subplots()
for curve in ('accuracy', 'val_accuracy'):
  ax.plot(history.history[curve])
ax.grid(axis='both')
ax.legend(['accuracy', 'val_accuracy'], loc='upper left')
plt.show()



> ## **Evaluate**



In [None]:
# prediction
# evaluate() scores the model on the in-memory test images; the result is indexed as
# p[0] (loss) and p[1] (accuracy) by the print cell that follows
p = model.evaluate(test_imgs,test_labels)

In [None]:
# p[0] is reported as the loss, p[1] as the accuracy (a fraction, shown here as a percentage)
print(f'Loss: {p[0]} -  Accuracy: {round(p[1]*100,3)}%')

# **NLP**



> ## **Imports**



In [None]:
import nltk
import string
import re
import keras.backend as K
from sklearn.model_selection import train_test_split
from keras.layers import Embedding,Dense,Dropout,LSTM
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import pandas as pd
pd.set_option('display.max_colwidth',100)

> ## **Load & Preprocess Data**

In [None]:
# Read the tweet CSVs (decoded as ISO-8859-1) and keep only the label and the raw text.
train_df = pd.read_csv('/content/twitter_train.csv',encoding = "ISO-8859-1")
train_df = train_df[['Sentiment','OriginalTweet']]
# NOTE(review): test_df is loaded but not used by any later cell in this notebook
test_df = pd.read_csv('/content/twitter_test.csv',encoding = "ISO-8859-1")

In [None]:
#CREATING LABELS
# One-hot encode the sentiment strings; class ids follow the order of first appearance.
all_categories = train_df['Sentiment'].unique()
out_categories = len(all_categories)
category_ids = {name: idx for idx, name in enumerate(all_categories)}
labels = to_categorical([category_ids[sentiment] for sentiment in train_df['Sentiment']],out_categories)

In [None]:
#VALIDATION SPLIT
# Hold out 20% of the labelled tweets (x_test / y_test serve as validation data for fit).
# A fixed random_state makes the split, and every metric computed on it, reproducible.
x_train,x_test,y_train,y_test = train_test_split(train_df['OriginalTweet'],labels,test_size=0.2,random_state=42)

In [None]:
# initialize and fit tokenizer
# The vocabulary is built from the training tweets only, not from the held-out ones.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)

In [None]:
# use the fitted tokenizer to turn the tweets of both splits into sequences of integer token ids
x_train_seq = tokenizer.texts_to_sequences(x_train)
x_test_seq = tokenizer.texts_to_sequences(x_test)

In [None]:
# number of token ids in the integer sequence of the first training tweet
len(x_train_seq[0])

In [None]:
# add padding to equalize size of each tweet
# every sequence ends up with exactly max_tweet_len token ids
max_tweet_len = 80
x_train_seq_padded = pad_sequences(x_train_seq,maxlen=max_tweet_len)
x_test_seq_padded = pad_sequences(x_test_seq,maxlen=max_tweet_len)

> ## **Training**

In [None]:
model = Sequential()
model.add(Embedding(len(tokenizer.index_word)+1,32)) # Creating vectors (vectorization inside model) of length 32; +1 because token ids start at 1
model.add(LSTM(32,dropout=0,recurrent_dropout=0)) # type of rnn
model.add(Dense(32,activation='relu'))
# Each tweet has exactly one sentiment and the model is trained with
# categorical_crossentropy, so the output must be a softmax distribution (independent
# sigmoids do not sum to 1). The width follows the one-hot labels instead of a literal 5.
model.add(Dense(y_train.shape[1],activation='softmax'))
# model.summary()

In [None]:
def recall_m(y_true, y_pred):
    """Recall over the given tensors: rounded true positives / rounded actual positives.

    K.epsilon() in the denominator guards against division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision over the given tensors: rounded true positives / rounded predicted positives.

    K.epsilon() in the denominator guards against division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(predicted) + K.epsilon())

def f1_m(y_true, y_pred):
    """Harmonic mean of precision_m and recall_m (epsilon-stabilised)."""
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    harmonic_half = (p * r) / (p + r + K.epsilon())
    return 2 * harmonic_half

In [None]:
# Track accuracy plus the custom precision/recall metrics defined above
# (f1_m is defined as well but is not passed here).
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics = ['accuracy',precision_m,recall_m]
)

In [None]:
# Train on the padded training sequences; the held-out 20% split is used as validation data.
history = model.fit(x_train_seq_padded,y_train,batch_size=32,epochs=10,
          validation_data=(x_test_seq_padded,y_test))

In [None]:
# Training vs. validation precision per epoch.
fig, ax = plt.subplots()
for curve in ('precision_m', 'val_precision_m'):
  ax.plot(history.history[curve])
ax.grid(axis='both')
ax.legend(['precision_m', 'val_precision_m'], loc='upper left')
plt.show()



> ## **Load & Preprocess Data**





> ## **Training**





> ## **Evaluate**



In [None]:
# prediction
# `model` is the LSTM sentiment classifier at this point, so it has to be scored on the
# padded tweet sequences of the held-out split, not on the vision test images.
p = model.evaluate(x_test_seq_padded,y_test)

In [None]:
# p[0] is reported as the loss, p[1] as the accuracy (a fraction, shown here as a percentage)
print(f'Loss: {p[0]} -  Accuracy: {round(p[1]*100,3)}%')

# **LOAD AND CLEAN DATA**

In [None]:
import nltk
import string
import re
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
import pandas as pd
pd.set_option('display.max_colwidth',100)

In [None]:
# Reload the tweet CSVs (decoded as ISO-8859-1), keeping only label and raw text.
train_df = pd.read_csv('/content/twitter_train.csv',encoding = "ISO-8859-1")
train_df = train_df[['Sentiment','OriginalTweet']]
test_df = pd.read_csv('/content/twitter_test.csv',encoding = "ISO-8859-1")
# NOTE: this rebinds `stopwords` from the imported nltk.corpus module to a plain list
# of English stop words; from here on `stopwords` is that list.
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()
wn = nltk.WordNetLemmatizer()

In [None]:
# 70/30 split of the tweet frame with a fixed seed.
# NOTE: `trainset` / `testset` previously held the vision file lists and are rebound here.
trainset, testset = train_test_split(train_df, train_size=0.7, test_size=0.3, random_state=42)

In [None]:
def clean_text(text):
  """Turn a piece of text into a list of lemmatized tokens without stop words.

  The text is not lower-cased here; callers do that themselves when they want it.
  Relies on the notebook-level ``stopwords`` list and ``wn`` lemmatizer.

  Args:
    text: the string to clean.

  Returns:
    List of lemmatized tokens, in their original order.
  """
  # remove punctuation
  text_nopunct = "".join([char for char in text if char not in string.punctuation])
  # tokenize (raw string: '\w' inside a normal literal is an invalid escape sequence)
  tokens = re.findall(r'\w+',text_nopunct) #re.split(r'\W+',text_nopunct)
  # remove stop words
  no_stopwords = [word for word in tokens if word not in stopwords]
  #LEMMATIZINNG (slow but more accurate)
  lemmatized = [wn.lemmatize(word) for word in no_stopwords]
  #stemmatized = [ps.stem(word) for word in no_stopwords]
  return lemmatized
# trainset comes out of train_test_split as a slice of train_df; assign() builds a new
# frame instead of writing a column into that slice (pandas' SettingWithCopyWarning).
trainset = trainset.assign(CleanedTweet=trainset['OriginalTweet'].apply(lambda x: clean_text(x.lower())))

In [None]:
# vectorizing data
from sklearn.feature_extraction.text import TfidfVectorizer

def clean_text_lower(doc):
  """Analyzer for TfidfVectorizer: lower-case the raw tweet, then clean_text() it."""
  return clean_text(doc.lower())

# A callable analyzer receives the raw document, so the vectorizer's own lower-casing
# is skipped. Lower-case explicitly so capitalised stop words are removed and the
# features agree with the CleanedTweet column.
tfidf_vect = TfidfVectorizer(analyzer = clean_text_lower)
X_tfidf = tfidf_vect.fit_transform(trainset['OriginalTweet'])
print('shape of df will be ',X_tfidf.shape)
# print('columns names are',tfidf_vect.get_feature_names())

In [None]:
# toarray() turns the sparse TF-IDF matrix into a dense one (one column per vocabulary term)
x_train = pd.DataFrame(X_tfidf.toarray())
# targets are the raw 'Sentiment' values of the same rows
y_train = trainset['Sentiment']

In [None]:
x_train_np = x_train.to_numpy()

# **MODEL**

In [None]:
from keras.layers import Embedding,Flatten, Dense
from keras.models import Sequential
from keras.optimizers import Adam

In [None]:
vocab_size = 5000
embedding_dim = 50
seq_len = 500
#MODEL
# NOTE(review): an Embedding layer looks up integer token ids, yet the fit cell passes
# x_train_np (the dense TF-IDF matrix) — confirm the intended input representation.
# NOTE(review): the output is a single relu unit trained with binary_crossentropy while
# y_train holds the 'Sentiment' column — confirm target encoding and output layer.
dense_model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=seq_len),
    Flatten(),
    Dense(1, activation='relu')])

In [None]:
# Compile with a binary loss and Adam at its default settings, then print the layer summary.
dense_model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
dense_model.summary()

In [None]:
# NOTE(review): validation_data is the training data itself, so the val_* numbers
# reported here say nothing about generalisation.
dense_model.fit(x_train_np, y_train, validation_data=(x_train_np, y_train), epochs=5, batch_size=64)
# dense_model.save_weights("model/dense.h5")

In [None]:
x_train.shape, y_train.shape

In [None]:
# x_train, x_test, y_train, y_test = train_test_split(pd.DataFrame(X_tfidf.toarray()),train_df['Sentiment'],test_size=0.2)

In [None]:
# X_features = pd.DataFrame(X_tfidf.toarray())

In [None]:
# (a stray reference to the undefined name `x` was removed here; it raised NameError on Run All)

# **playground**

In [None]:
import gensim
import gensim.downloader as api
wiki_embeddings = api.load('glove-wiki-gigaword-100')

In [None]:
wiki_embeddings['king']

In [None]:
wiki_embeddings.most_similar('king')

In [None]:
trainset.head()

In [None]:
train_df['gensim_cleaned'] = train_df['OriginalTweet'].apply(lambda x: gensim.utils.simple_preprocess(x))

In [None]:
# NOTE(review): the result of this split is not assigned to anything; it is only displayed as the cell output
train_test_split(trainset)

In [None]:
# Split the gensim-tokenized tweets and their sentiment labels 80/20.
# `y_test` used to be misspelled `ytest`, which left the y_test of an earlier split
# paired with this new x_test.
x_train, x_test, y_train, y_test = train_test_split(train_df['gensim_cleaned'],
                train_df['Sentiment'],test_size =0.2)
ytest = y_test  # keep the old (misspelled) name available

In [None]:
#training our own wordtovec
# NOTE(review): `size` is the keyword used by gensim releases before 4.0 (newer ones
# call it `vector_size`) — confirm against the installed gensim version.
w2v_model = gensim.models.Word2Vec(x_train, #convert this to vectors
                                   size=100, # size of vector we want
                                   window=5, # window of context words to consider at once, like in tut
                                   min_count=2) # number of times word needs to atleast appear in our corpus of data.

In [None]:
w2v_model.wv['king']

In [None]:
w2v_model.wv.most_similar('happy')

In [None]:
w2v_model.wv.index2word

In [None]:
x_train

In [None]:
# number of characters in the tweet excluding spaces (probably not useful)... anyways!
train_df['AlphabetCount'] = train_df['OriginalTweet'].apply(lambda x: len(x) - x.count(' '))

In [None]:
# feature engineering
def count_punct(text):
  """Return the percentage of characters in ``text`` that are punctuation.

  Args:
    text: the string to inspect.

  Returns:
    Share of ``string.punctuation`` characters in percent, rounded to 3 decimals;
    ``0.0`` for an empty string (which used to raise ZeroDivisionError).
  """
  if not text:
    return 0.0
  total_puncts = sum(1 for char in text if char in string.punctuation)
  return round((total_puncts * 100) / len(text),3)
train_df['PunctutationCount'] = train_df['OriginalTweet'].apply(lambda x: count_punct(x))

In [None]:
bins = np.linspace(0,800,40)
# One translucent histogram of AlphabetCount per sentiment class. A stray
# plt.hist([1,2,3,4]) that drew an unrelated, unlabeled series was dropped.
for sentiment in ['Neutral', 'Positive', 'Extremely Negative', 'Negative', 'Extremely Positive']:
  plt.hist(train_df[train_df['Sentiment'] == sentiment]['AlphabetCount'],bins,alpha= 0.5,  label=sentiment)
plt.legend(loc='upper left')
plt.show()

In [None]:
train_df.head(30)

In [None]:
# string.punctuation in 'i am khizer.'
sum([1 for char in 'i am.  5 &#$%^ khizer.' if char in string.punctuation])

In [None]:
#tokenization (findall(), split())
s = 'helo is isss  ,  :: """"  my name'
# raw string: '\w' inside a normal literal is an invalid escape sequence
re.findall(r'\w+',s)

In [None]:
s = 'I follow PEP8, pep9 & pep 8 Guidlines'
re.findall('[A-Za-z/t]+[0-9]+',s)

In [None]:
s = 'I follow PEP8 Guidlines & i was born in 1992 when i was the leader in 123'
re.findall('[0-9]+',s)

In [None]:
train_df.head()

In [None]:
#LEMMATIZINNG (slow but more accurate)
wn = nltk.WordNetLemmatizer()
def lemmatizing(tokenized_text):
  """Return a list with every token of ``tokenized_text`` passed through ``wn.lemmatize``."""
  return list(map(wn.lemmatize, tokenized_text))

# NOTE(review): no cell in this notebook creates a 'cleaned_text' column, so this lookup
# raises KeyError on a fresh run — confirm which tokenized column was meant.
train_df['lemmatized_tweets'] = train_df['cleaned_text'].apply(lambda x: lemmatizing(x))

In [None]:
#STEMMING (more faster than lemmizer)
ps = nltk.PorterStemmer()
def stemming(tokenized_text):
  """Return a list with every token of ``tokenized_text`` passed through ``ps.stem``."""
  return list(map(ps.stem, tokenized_text))

# NOTE(review): no cell in this notebook creates a 'cleaned_text' column, so this lookup
# raises KeyError on a fresh run — confirm which tokenized column was meant.
train_df['stemmed_tweets'] = train_df['cleaned_text'].apply(lambda x: stemming(x))

In [None]:
print(ps.stem('meaning'))
print(ps.stem('meanness'))
print(wn.lemmatize('meaning'))
print(wn.lemmatize('meanness'))

In [None]:
print(ps.stem('goose'))
print(ps.stem('geese'))
print(wn.lemmatize('goose'))
print(wn.lemmatize('geese'))

In [None]:
train_df.head(20)

In [None]:
# REMOVE PUNCTUATIONS

def remove_punct(text):
  """Return ``text`` with every character listed in ``string.punctuation`` removed."""
  kept_chars = filter(lambda char: char not in string.punctuation, text)
  return "".join(kept_chars)

train_df['OriginalTweet_clean'] = train_df['OriginalTweet'].apply(lambda x: remove_punct(x))

In [None]:
# TOKENIZATION
def tokenize(text):
  """Split ``text`` into word tokens.

  The text is split on runs of non-word characters. Empty strings, which re.split
  produces when the text starts or ends with a separator, are dropped so they do not
  show up as tokens.

  Args:
    text: the string to tokenize.

  Returns:
    List of non-empty tokens, in order.
  """
  # raw string: '\W' inside a normal literal is an invalid escape sequence
  return [token for token in re.split(r'\W+',text) if token]

train_df['OriginalTweet_tokenized'] = train_df['OriginalTweet_clean'].apply(lambda x: tokenize(x.lower()))

In [None]:
# REMOVE STOP WORDS
stopwords = nltk.corpus.stopwords.words('english')
def remove_stopwords(tokenized_list):
  """Return, in order, the tokens that are not in the notebook-level ``stopwords`` collection."""
  return list(filter(lambda word: word not in stopwords, tokenized_list))

train_df['not_stop_words'] = train_df['OriginalTweet_tokenized'].apply(lambda x: remove_stopwords(x))

In [None]:
#STEMMING
ps = nltk.PorterStemmer()
def stemming(tokenized_text):
  """Stem every token with the notebook-level Porter stemmer ``ps``.

  Args:
    tokenized_text: iterable of tokens.

  Returns:
    List of stemmed tokens, in order.
  """
  # the previous body copied the tokens unchanged and never called the stemmer
  text = [ps.stem(word) for word in tokenized_text]
  return text

train_df['stemmed_tweets'] = train_df['OriginalTweet_tokenized'].apply(lambda x: stemming(x))

In [None]:
train_df.head()

In [None]:
train_df.iloc[1:2]

In [None]:
print(ps.stem("grow"))
print(ps.stem("grows"))
print(ps.stem("growing"))
print(ps.stem("grown"))

In [None]:
print(ps.stem("mean"))
print(ps.stem("meanness"))
print(ps.stem("meaning"))
print(ps.stem("meaningless"))

In [None]:
print(ps.stem("run"))
print(ps.stem("running"))
print(ps.stem("runner"))

In [None]:
train_df

In [None]:
# `stopwords` was rebound to a plain list of English stop words in earlier cells, so
# `.words()` has to be called on the nltk corpus reader itself.
nltk.corpus.stopwords.words()

In [None]:
nltk.download('stopwords')

In [None]:
# Peek at one sample of the next training batch: print its one-hot label and show the image.
ind = 12
batch = next(train_gen)
ind = min(ind, len(batch[0]) - 1)  # stay inside the batch if it is shorter than expected
print(batch[1][ind])
img = batch[0][ind]
# cv2.imread delivers the channels in BGR order while matplotlib expects RGB,
# so reverse the last axis for display.
plt.imshow(img[..., ::-1])