## Name: Ijaz Ahmad
## ID: I19-1873

# Text Data

In [1]:
import re
import warnings
import numpy as np
import pandas as pd
warnings.filterwarnings('ignore')

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [3]:
from sklearn.metrics import classification_report, f1_score

In [4]:
data = pd.read_csv("labels.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,image_name,text_ocr,text_corrected,overall_sentiment
0,0,image_1.jpg,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,very_positive
1,1,image_2.jpeg,The best of #10 YearChallenge! Completed in le...,The best of #10 YearChallenge! Completed in le...,very_positive
2,2,image_3.JPG,Sam Thorne @Strippin ( Follow Follow Saw every...,Sam Thorne @Strippin ( Follow Follow Saw every...,positive
3,3,image_4.png,10 Year Challenge - Sweet Dee Edition,10 Year Challenge - Sweet Dee Edition,positive
4,4,image_5.png,10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...,10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...,neutral


In [5]:
df = data[['text_corrected', 'overall_sentiment']]

### mapping each sentiment with respective value

In [6]:
order={"very_positive":1,"positive":1,"neutral":0,"negative":-1,"very_negative":-1}
df['overall_sentiment_label'] = df['overall_sentiment'].map(order)

### spliting the dataset into two parts

In [7]:
X = df['text_corrected']
y = df['overall_sentiment_label']

In [8]:
X.shape, y.shape

((6992,), (6992,))

### cleaning Training DataSet 'X'

In [9]:
cleaned_Data = []
stem = WordNetLemmatizer()
stopwords_nltk = set(stopwords.words("english"))

for sen in range(len(X)):
    text = str(X[sen])
    words = text.split(" ")
    alpha = [word for word in words if word.isalpha()]
    lower = [word.lower() for word in alpha]
    cleanedwords = [word for word in lower if word not in stopwords_nltk]
    stemWords = [stem.lemmatize(word) for word in cleanedwords]
    cleanText = ' '.join(stemWords)
    cleaned_Data.append(cleanText)

In [10]:
len(cleaned_Data)

6992

### vectorizer

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidfconverter = TfidfVectorizer(smooth_idf = False)
X = tfidfconverter.fit_transform(cleaned_Data).toarray()

### divide the dataset into training and testing  

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

### MultinomialNB

In [13]:
from sklearn.naive_bayes import MultinomialNB
model1 = MultinomialNB(alpha = .01)
model1.fit(X_train, y_train)
y_predict = model1.predict(X_test)
print("F1 Score: ",f1_score(y_test, y_predict,average='macro'))
print("Classification Report: \n", classification_report(y_test, y_predict))

F1 Score:  0.3123758950513804
Classification Report: 
               precision    recall  f1-score   support

          -1       0.10      0.07      0.08       131
           0       0.29      0.19      0.23       462
           1       0.56      0.70      0.62       806

    accuracy                           0.47      1399
   macro avg       0.32      0.32      0.31      1399
weighted avg       0.43      0.47      0.44      1399



# BaggingClassifier

In [14]:
from sklearn.ensemble import BaggingClassifier
model2 = BaggingClassifier()
model2.fit(X_train, y_train)
pred = model2.predict(X_test)
print("F1 Score: ",f1_score(y_test, pred,average='macro'))
print("Classification Report: \n", classification_report(y_test, pred))

F1 Score:  0.3140337821830959
Classification Report: 
               precision    recall  f1-score   support

          -1       0.09      0.02      0.03       131
           0       0.32      0.23      0.27       462
           1       0.57      0.75      0.65       806

    accuracy                           0.51      1399
   macro avg       0.33      0.33      0.31      1399
weighted avg       0.45      0.51      0.46      1399



# ExtraTreesClassifier

In [15]:
from sklearn.ensemble import ExtraTreesClassifier
model3 =  ExtraTreesClassifier(random_state=70)
model3.fit(X_train, y_train)
pred3 = model3.predict(X_test)
print("F1 Score: ",f1_score(y_test, pred3,average='macro'))
print("Classification Report: \n", classification_report(y_test, pred3))

F1 Score:  0.3181382467973178
Classification Report: 
               precision    recall  f1-score   support

          -1       0.18      0.03      0.05       131
           0       0.32      0.18      0.23       462
           1       0.58      0.81      0.67       806

    accuracy                           0.53      1399
   macro avg       0.36      0.34      0.32      1399
weighted avg       0.46      0.53      0.47      1399



In [16]:
# from sklearn.ensemble import VotingClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.naive_bayes import GaussianNB
# from sklearn.svm import SVC

# clf1 = LogisticRegression(random_state=42)
# clf2 = RandomForestClassifier(random_state=42)
# clf3 = GaussianNB()
# clf4 = SVC(probability=True, random_state=42)

# eclf = VotingClassifier(estimators=[('LR', clf1), ('RF', clf2), ('GNB', clf3), ('SVC', clf4)],
#                         voting='soft', weights=[1,2,1,1])

# eclf.fit(X_train, y_train)

In [17]:
# p = eclf.predict(X_test)
# print("F1 Score: ",f1_score(y_test, p,average='macro'))
# print("Classification Report: \n", classification_report(y_test, p))

In [18]:
# example of evaluating a stacking ensemble for classification
# from numpy import mean
# from numpy import std
# from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import RepeatedStratifiedKFold
# from sklearn.ensemble import StackingClassifier
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.linear_model import LogisticRegression

# models = [('knn', KNeighborsClassifier()), ('tree', DecisionTreeClassifier())]
# model = StackingClassifier(models, final_estimator=LogisticRegression(), cv=3)
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
# # report ensemble performance
# print('Mean Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

# Voting between Text model

In [19]:
from sklearn.ensemble import VotingClassifier

In [20]:
clfv1 = VotingClassifier(estimators=[('NB', model1),('BC', model2),('AB', model3)], voting='hard')

In [21]:
clfv1.fit(X_train, y_train)
clfv1_Predictions = clfv1.predict(X_test)

print("F1 Score: ",f1_score(y_test, clfv1_Predictions, average='macro'))
print("Classification Report: \n", classification_report(y_test, clfv1_Predictions))

F1 Score:  0.31998213612248705
Classification Report: 
               precision    recall  f1-score   support

          -1       0.15      0.05      0.07       131
           0       0.32      0.16      0.22       462
           1       0.58      0.80      0.67       806

    accuracy                           0.52      1399
   macro avg       0.35      0.34      0.32      1399
weighted avg       0.45      0.52      0.47      1399



# Images data

In [22]:
import os
import pandas as pd
from skimage.io import imread, imshow
from skimage import filters
from skimage import feature
from PIL import Image
import re

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from skimage.io import imread, imshow

In [24]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, classification_report 
from sklearn.model_selection import train_test_split

In [25]:
# path of both main and resize dir
path = r'.\images'
save_path = r'.\resized_images'

In [26]:
# if dir exsit first delete it then make a new one
# this is for multiple runs
import shutil
if os.path.exists(save_path):
    shutil.rmtree(save_path)
os.mkdir(save_path)

In [27]:
# import the dataset again
df = pd.read_csv('labels.csv')
df.head(3)

Unnamed: 0.1,Unnamed: 0,image_name,text_ocr,text_corrected,overall_sentiment
0,0,image_1.jpg,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,very_positive
1,1,image_2.jpeg,The best of #10 YearChallenge! Completed in le...,The best of #10 YearChallenge! Completed in le...,very_positive
2,2,image_3.JPG,Sam Thorne @Strippin ( Follow Follow Saw every...,Sam Thorne @Strippin ( Follow Follow Saw every...,positive


In [28]:
# sperate the image col
names = df['image_name']

In [29]:
# map each setiment from 5 to 1
order={"very_positive":1,"positive":1,"neutral":0,"negative":-1,"very_negative":-1}
df['overall_sentiment_label'] = df['overall_sentiment'].map(order)

In [30]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,image_name,text_ocr,text_corrected,overall_sentiment,overall_sentiment_label
0,0,image_1.jpg,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,very_positive,1
1,1,image_2.jpeg,The best of #10 YearChallenge! Completed in le...,The best of #10 YearChallenge! Completed in le...,very_positive,1
2,2,image_3.JPG,Sam Thorne @Strippin ( Follow Follow Saw every...,Sam Thorne @Strippin ( Follow Follow Saw every...,positive,1


In [31]:
# loop over all images to resize them and save them to new-path "resize_images"
for i in range(len(names)):
    try:
        imgpath = path + "\\" + names[i]
        img = Image.open(imgpath)
        if img.mode != 'RGB':
            img = img.convert('RGB')

        img2 = img.resize((30,30))
        new_name = save_path + "\\" + names[i]        
        img2.save(new_name)
#         fd, hog_image = hog(img2, orientations=9, pixels_per_cell=(8, 8), 
#                     cells_per_block=(2, 2), visualize=True, multichannel=True)
#         features.append(fd)
    
    except:        
        print('Truncated image')
        print(imgpath)
        #features.append(np.nan)
print("Successfully done!")

Truncated image
.\images\image_5119.png
Successfully done!


In [32]:
# this function will return the soble features
# of each image
def get_sobel_features(image):
    ed_sobel = filters.sobel(image)
    return ed_sobel

In [33]:
sobel_features=[] # list to store sobel-featurs
path = r'.\resized_images'

for i in range(len(names)-1):
    imgpath = imgpath = path + "\\" + names[i]
    if i == 5118:
        print(imgpath)
        sobel_features.append(np.nan)
        continue
    
    image1 = imread(imgpath, as_gray=True)     
    features = np.array(get_sobel_features(image1))
    sobel_features.append(features)
    
print("Successfully done!")

.\resized_images\image_5119.png
Successfully done!


In [34]:
df = df.drop(5118)

In [35]:
df['Features'] = sobel_features

In [36]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,image_name,text_ocr,text_corrected,overall_sentiment,overall_sentiment_label,Features
0,0,image_1.jpg,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,very_positive,1,"[[0.01462766841720581, 0.042760404753765816, 0..."
1,1,image_2.jpeg,The best of #10 YearChallenge! Completed in le...,The best of #10 YearChallenge! Completed in le...,very_positive,1,"[[0.028889749338637253, 0.11560853377964683, 0..."
2,2,image_3.JPG,Sam Thorne @Strippin ( Follow Follow Saw every...,Sam Thorne @Strippin ( Follow Follow Saw every...,positive,1,"[[0.04392511466219837, 0.2409647798594005, 0.6..."


In [37]:
df1 = df[['Features', 'overall_sentiment_label']]

In [38]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6991 entries, 0 to 6991
Data columns (total 2 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Features                 6990 non-null   object
 1   overall_sentiment_label  6991 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 163.9+ KB


In [39]:
df1.dropna(axis=0, inplace=True)
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6990 entries, 0 to 6991
Data columns (total 2 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Features                 6990 non-null   object
 1   overall_sentiment_label  6990 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 163.8+ KB


In [40]:
fd = df1['Features']
dum = pd.DataFrame() # make a dummy DataFrame to reshize the features
for i in range(len(fd)):
    temp = pd.DataFrame(fd.iloc[i].reshape(1,900))
    dum = dum.append(temp)
print("Successfully done!")

Successfully done!


In [41]:
dum.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,890,891,892,893,894,895,896,897,898,899
0,0.014628,0.04276,0.030482,0.002168,0.017837,0.055762,0.160481,0.197884,0.203825,0.172458,...,0.09047,0.077351,0.097626,0.118565,0.107398,0.110816,0.12536,0.119153,0.087213,0.048821
0,0.02889,0.115609,0.094512,0.026434,0.016477,0.013112,0.062954,0.051856,0.03641,0.006343,...,0.006413,0.014226,0.004431,0.00834,0.00357,0.000794,0.00873,0.010398,0.005689,0.010577
0,0.043925,0.240965,0.618214,0.321785,0.129623,0.143364,0.105365,0.076359,0.085911,0.077154,...,0.011979,0.031001,0.020096,0.005583,0.003745,0.011316,0.124373,0.373182,0.244877,0.004716


In [42]:
X = dum
y = df1['overall_sentiment_label']

In [43]:
X.shape, y.shape

((6990, 900), (6990,))

In [44]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, test_size=0.2, random_state=101)

In [45]:
from sklearn.linear_model import LogisticRegression
clf1 = LogisticRegression()
clf1.fit(X_train2, y_train2)
pred2 = clf1.predict(X_test2)
print("F1 Score: ",f1_score(y_test2, pred2, average='macro'))
print("Classification Report: \n", classification_report(y_test2, pred2))

F1 Score:  0.32103192095289423
Classification Report: 
               precision    recall  f1-score   support

          -1       0.05      0.01      0.01       125
           0       0.36      0.20      0.25       452
           1       0.60      0.83      0.70       821

    accuracy                           0.55      1398
   macro avg       0.34      0.34      0.32      1398
weighted avg       0.47      0.55      0.49      1398



In [46]:
from sklearn.tree import DecisionTreeClassifier
clf2 = DecisionTreeClassifier()
clf2.fit(X_train2, y_train2)
pred3 = clf2.predict(X_test2)
print("F1 Score: ",f1_score(y_test2, pred3, average='macro'))
print("Classification Report: \n", classification_report(y_test2, pred3))

F1 Score:  0.3441731607712215
Classification Report: 
               precision    recall  f1-score   support

          -1       0.09      0.10      0.10       125
           0       0.34      0.34      0.34       452
           1       0.60      0.59      0.60       821

    accuracy                           0.46      1398
   macro avg       0.34      0.34      0.34      1398
weighted avg       0.47      0.46      0.47      1398



In [47]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier()
clf.fit(X_train2, y_train2)
pred = clf.predict(X_test2)
print("F1 Score: ",f1_score(y_test2, pred, average='macro'))
print("Classification Report: \n", classification_report(y_test2, pred))

F1 Score:  0.3333030585865742
Classification Report: 
               precision    recall  f1-score   support

          -1       0.08      0.03      0.04       125
           0       0.35      0.25      0.29       452
           1       0.60      0.75      0.67       821

    accuracy                           0.52      1398
   macro avg       0.34      0.34      0.33      1398
weighted avg       0.47      0.52      0.49      1398



In [48]:
from sklearn.ensemble import VotingClassifier

In [49]:
clfv2 = VotingClassifier(estimators=[('RF', clf1),('DT', clf2),('KNN', clf)], voting='hard')

In [50]:
clfv2.fit(X_train2, y_train2)
clfv2_Predictions = clfv2.predict(X_test2)

print("F1 Score: ",f1_score(y_test2, clfv2_Predictions, average='macro'))
print("Classification Report: \n", classification_report(y_test2, clfv2_Predictions))

F1 Score:  0.33247070044277155
Classification Report: 
               precision    recall  f1-score   support

          -1       0.09      0.05      0.06       125
           0       0.38      0.17      0.23       452
           1       0.60      0.83      0.70       821

    accuracy                           0.55      1398
   macro avg       0.36      0.35      0.33      1398
weighted avg       0.49      0.55      0.49      1398



# Now Voting between Image final and Text final

In [51]:
from sklearn.ensemble import VotingClassifier

In [52]:
final = VotingClassifier(estimators=[('IM', clfv2),('TX', clfv1)], voting='hard')

In [53]:
final.fit(X_train2, y_train2)
final_Predictions = final.predict(X_test2)

print("F1 Score: ",f1_score(y_test2, final_Predictions, average='macro'))
print("Classification Report: \n", classification_report(y_test2, final_Predictions))

F1 Score:  0.33498193371516566
Classification Report: 
               precision    recall  f1-score   support

          -1       0.10      0.06      0.07       125
           0       0.37      0.19      0.25       452
           1       0.60      0.80      0.69       821

    accuracy                           0.54      1398
   macro avg       0.36      0.35      0.33      1398
weighted avg       0.48      0.54      0.49      1398



## save pickle

In [54]:
import pickle
Pkl_Filename = "final_result.pkl"  

with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(final, file)