In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from collections import Counter

In [2]:
trainset = pd.read_csv("E:\\programs\\python\\NLP\\trainset.csv", delimiter=',', encoding='utf-8')
validationset = pd.read_csv("E:\\programs\\python\\NLP\\validationset.csv", delimiter=',', encoding='utf-8')

In [3]:
# display complete contents of a dataframe without any kind of truncation
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)
pd.set_option('display.width',None)
pd.set_option('display.max_colwidth',-1)

In [4]:
print(trainset.head(10)) # print 10 first rows

   Unnamed: 0  Unnamed: 0.1  archive_by_user                   brand  \
0  0           282086        True             NaN                      
1  1           762753        True             Samsung::سامسونگ         
2  2           805240        True             NaN                      
3  3           556730        False            NaN                      
4  4           727332        True             NaN                      
5  5           805039        True             NaN                      
6  6           812617        False            NaN                      
7  7           295730        True             NaN                      
8  8           777605        False            نیسان::Nissan            
9  9           797079        False            پراید صندوق‌دار::Pride   

                 cat1                       cat2            cat3     city  \
0  personal            clothing-and-shoes         shoes-belt-bag  Tehran    
1  electronic-devices  mobile-tablet              mob

In [5]:
train_len = trainset['Unnamed: 0'].count() #the first row is the topics so we got 700000 data
valid_len = validationset['Unnamed: 0'].count()  #the first row is the topics so we got 147635 data

In [6]:
print(train_len) #printing the data rows in trainset
print(valid_len) #printing the data rows in validationset

700000
147635


In [7]:
number_per_cat = trainset.groupby("cat1")["id"].count() #calculating the number of items in each category

In [8]:
number_per_cat

cat1
businesses            45660 
electronic-devices    122905
for-the-home          214955
leisure-hobbies       61676 
personal              102804
vehicles              152000
Name: id, dtype: int64

In [9]:
arr = pd.read_csv("E:\\programs\\python\\NLP\\projects\\persian.csv", sep="\n", encoding='utf-8')
stop_words = arr.values.tolist() #converting the stop words into a list
stop_words = [item for sublist in stop_words for item in sublist] #converting stop words into a 1d list

In [159]:
descriptions = trainset['desc']
titles = trainset['title']

In [160]:
descriptions.head(3) #printing the first 3 rows of description

0    چکمه یکبار پوشیده شده قیمت 42\nکفش قهوه ای سوخته هشتگ نو قیمت 27\nکفش گیپوری شیشه ای قیمت 25 با کفی به 39 میشه و بدون کفی به 40 و نو نو\nامکان پست\nمزاحمت مستقیما پلیس فتا
1    گوشی رو تا حالا باز نکردم و تو جعبه پلمپه از دیجی کالا برام گرفتنش با گارانتی مایروتل  قیمتش تو دیجی ۹۳۰ تومن                                                              
2    ساعت هیچ مشکلی ندارد اصل اصل هستش چون دیگه دستم نمیکنم میخوام بفروشم                                                                                                       
Name: desc, dtype: object

In [161]:
sample = descriptions.head(5) # first 5 rows of description

In [162]:
def preprocessing(text):
    """
     This method takes an unprocessed text and removes its stop words, and punctuations. It takes a list of simple texts and 
     returns a list containing words that are processed for each text.
     text: the unprocessed text
     cleared_text: the processed text
    """
    processed_description = [] #the processed descriptions would go here
    cleared_text = [] #a list to save all of the processed descriptions
    for i in range(len(text)):
        desc = text[i]
        words = desc.split(" ")
        
#     print(len(words))
        for j in range(len(words)):
            if words[j] not in stop_words:
                processed_description.append(words[j])
#     print(words)
#     print("======================================")
#     print(len(processed_description))
#         print(processed_description)
        cleared_text.append(processed_description)
#         print(cleared_text)
        processed_description = []
    return cleared_text

In [163]:
uncleared_text = [] #an example to show how the preprocessing works
for i in range(len(sample)):
    desc = sample[i]
    words = desc.split(" ")
    uncleared_text.append(words)
print(uncleared_text) #unprocessed text for the first five rows
print("========================")
cleared_text = preprocessing(sample)
print(cleared_text) #processed text for the first five rows

[['چکمه', 'یکبار', 'پوشیده', 'شده', 'قیمت', '42\nکفش', 'قهوه', 'ای', 'سوخته', 'هشتگ', 'نو', 'قیمت', '27\nکفش', 'گیپوری', 'شیشه', 'ای', 'قیمت', '25', 'با', 'کفی', 'به', '39', 'میشه', 'و', 'بدون', 'کفی', 'به', '40', 'و', 'نو', 'نو\nامکان', 'پست\nمزاحمت', 'مستقیما', 'پلیس', 'فتا'], ['گوشی', 'رو', 'تا', 'حالا', 'باز', 'نکردم', 'و', 'تو', 'جعبه', 'پلمپه', 'از', 'دیجی', 'کالا', 'برام', 'گرفتنش', 'با', 'گارانتی', 'مایروتل', '', 'قیمتش', 'تو', 'دیجی', '۹۳۰', 'تومن'], ['ساعت', 'هیچ', 'مشکلی', 'ندارد', 'اصل', 'اصل', 'هستش', 'چون', 'دیگه', 'دستم', 'نمیکنم', 'میخوام', 'بفروشم'], ['دوچرخه', 'از', 'هرلحاظ', 'سالمه', 'و', 'فقط', 'مدت', 'کوتاهی', 'استفاده', 'شده.به', 'خریدار', 'واقعی', 'تخفیف', 'هم', 'میدم'], ['14', 'اسکناس', 'مطابق', 'تصویر', 'همه', 'باهم', '200', 'هزار', 'تومان\nمناسب', 'کلکسیونر', 'و', 'مجموعه', 'دار.\nفقط', 'خریدار', 'واقعی', 'زنگ', 'بزنه', '،', 'پیامک', 'نمیتونم', 'جواب', 'بدم$NUM']]
[['چکمه', 'یکبار', 'پوشیده', 'قیمت', '42\nکفش', 'قهوه', 'سوخته', 'هشتگ', 'نو', 'قیمت', '27\nکفش',

In [164]:
cleared_descriptions = preprocessing(descriptions.head(1000)) #preprocessed descriptions
cleared_titles = preprocessing(titles.head(1000)) #preprocessed titles

In [165]:
number_of_texts = len(cleared_titles) #total number of texts in our corpus
print(number_of_texts)

1000


In [166]:
def tf_idf(text):
    """
     This method calculates the tf_idf which is tf(word, text) * idf(word). With tf_idf we represent texts in numerical forms
     text: the text in which we calculate the tf_idf for.
     returns a vector containing the tf_idf for all of the words in the list text
    """
    vector = []
    length_of_text = len(text) #the number of words in our text
    for i in range(length_of_text):
        word = text[i] #put the ith word of text into the variable word
        vector.append(tf(word, text) * idf(word, cleared_descriptions))
    return vector

In [167]:
def tf(word, text):
    """
     This method calculates the tf which is the number of times the word occurs in a text on the number of words in the text.
     word: the word we are calculating tf for
     text: the text in which we calculate the tf for word
    """
    number_of_occurances_in_text = 0 #number of times the word occurs in the text
    number_of_words_in_text = len(text) #total number of words in the text
    for i in range(number_of_words_in_text): #loop over the list text and count the number of occurances of the word
        if text[i] == word:
            number_of_occurances_in_text += 1
            
    return (number_of_occurances_in_text/number_of_words_in_text)

In [168]:
def idf(word, docs):
    """
     This method calculates the idf which is the logarithm (number of text in the corpus/number of texts where the word occurs).
     word: the word we are calculating idf for
     docs: all of our texts or documents
    """
    number_of_texts = len(docs) #total number of texts in our corpus
    number_of_texts_where_the_word_occurs = 0 #number of texts where the word occurs in the corpus
    for i in range(number_of_texts): #loop over all the documents or texts in the corpus and count the number of occurances of the word in corpus
        if word in docs[i]:
            number_of_texts_where_the_word_occurs += 1
    return (math.log((1 + number_of_texts)/(1 + number_of_texts_where_the_word_occurs)) + 1) #applying smoothing   

In [169]:
print(cleared_descriptions[2])

['ساعت', 'مشکلی', 'اصل', 'اصل', 'هستش', 'دستم', 'نمیکنم', 'میخوام', 'بفروشم']


In [170]:
print(tf('اصل', cleared_descriptions[2]))

0.2222222222222222


In [171]:
print(idf('اصل',cleared_descriptions))

4.864232341591798


In [172]:
non_numerical_title = cleared_titles # the list of processed words from the title to feed our model with
non_numerical_description = cleared_descriptions # the list of processed words from the descriptions to feed our model with
#Y is our actual values for cat1
non_numerical_Y = trainset['cat1'].head(1000) # we want to predict the cat1 with the title and description that we have for each text

In [173]:
def categorizer(category):
    """
    This method will get the list of words for categories and assigns some integer values to them instead. The integer values
    are as the following:
    businesses ==> 0
    electronic-devices ==> 1
    for-the-home ==> 2
    leisure-hobbies ==>3
    personal ==> 4
    vehicles ==> 5
    """
    classes = []
    category_length = len(category)
    for i in range(category_length):
        if category[i] == "businesses":
            classes.append(0)
            
        elif category[i] == "electronic-devices":
            classes.append(1)
            
        elif category[i] == "for-the-home":
            classes.append(2)
            
        elif category[i] == "leisure-hobbies":
            classes.append(3)
            
        elif category[i] == "personal":
            classes.append(4)
            
        else:
            classes.append(5)
            
    return classes
            

In [174]:
#in here we are going to create  a simple x for each text sample
def get_numerical_features(non_numerical_title, non_numerical_description):
    """
    This method returns the numerical features for the texts. One simple X for each text sample
    """
    numerical_title = []
    numerical_description = []
    for i in range(len(non_numerical_title)): #calculating a simple X for each text sample 
        numerical_title.append(np.mean(tf_idf(non_numerical_title[i])))
        numerical_description.append(np.mean(tf_idf(non_numerical_description[i])))
        
    return numerical_title, numerical_description

In [175]:
#A list containing numerical title and description for each text sample in the training set
titles, descriptions = get_numerical_features(non_numerical_title, non_numerical_description) 
Y = categorizer(non_numerical_Y) #A list containing an integer for the category in cat1

In [176]:
for i in range(5): #printing first 5 rows of X and their Corresponding Y
    print(titles[i]," ", descriptions[i]," " ,Y[i])

2.6935211104277985   0.30077919597371894   4
1.600572071464723   0.4815505016279709   1
1.4876878085472134   0.7793919257670185   4
0.6676011863569481   0.7048607115853699   4
1.8398621587452915   0.37883513632745447   3


In [177]:
for i in range(5): #show some of the cat1 and their classes
    print(non_numerical_Y[i]," " ,Y[i])

personal   4
electronic-devices   1
personal   4
personal   4
leisure-hobbies   3


In [178]:
# Dividing the trainset into training set and test set. We need to create a test set out of our training
# set in order to update our model. In each itertion, we create a new test set out of our trainset and will try to predict them
# to see that is our model is good enough to go against the validationset.
# After the model is created, we go against the validation set with y = mx + b
# train_test_split, Splits arrays or matrices into random train and test subsets

# X_train : the training data out of our training set, used to train our model in each iteration

# X_test : the desired outputs for the X_train, used to train our model in each iteration 

# Y_train : part of the trainset that we seperate and use to predict Y_test to see if our model is good enough. If the model still
# is not good enough, then we update m and b of the linear function to predict better in the next iteration

# Y_test : the desired outputs for the Y_train. we compare these with the prediction of our model based on Y_train to calculate accuracy.
# if the accuracy is good enough, then the model is ready and we're done! 

#the following are all lists
title_train, title_test, description_train, description_test, Y_train, Y_test = train_test_split(
        titles, descriptions, Y, test_size=0.2, random_state=42)

In [179]:
for i in range(5):
    print(title_train[i], title_test[i], description_train[i], description_test[i], Y_train[i], Y_test[i])

2.152172834584008 1.6168620465531882 0.4142020582760249 0.47727722177441445 4 2
2.1972245132626926 1.3463276148926013 0.5798361472661884 0.46696302756873126 5 0
2.6506434502553637 1.7461010254204858 1.8039018996888188 0.630088776491206 5 2
1.0201267875989601 2.3698187850163657 1.303880056231026 0.7088518356933293 5 2
2.167417403641062 1.3503612724636997 0.5001951235950429 0.44082659674614566 5 2


In [180]:
print(len(title_train)," ",len(title_test)," " ,len(description_train), " ",len(description_test), " ",len(Y_train), " ",len(Y_test))

800   200   800   200   800   200


In [181]:
def calculate_businesses(classes):
    """
    This method returns a list which contains one as an element, whenever the cat1 is business for that data and is zero otherwise
    """
    businesses = []
    classes_length = len(classes)
    for i in range(classes_length): #loop over the classes and if 0 which means the cat1 is businesses, append 1. append 0 otherwise
        if classes[i] == 0:
            businesses.append(1)
            
        else:
            businesses.append(0)
            
    return businesses


In [182]:
def calculate_electronic_devices(classes):
    """
    This method returns a list which contains one as an element, whenever the cat1 is electronic-devices for that 
    data and is zero otherwise
    """
    electronics = []
    classes_length = len(classes)
    for i in range(classes_length): #loop over the classes and if 0 which means the cat1 is businesses, append 1. append 0 otherwise
        if classes[i] == 1:
            electronics.append(1)
            
        else:
            electronics.append(0)
            
    return electronics


In [183]:
def calculate_for_the_home(classes):
    """
    This method returns a list which contains one as an element, whenever the cat1 is for-the-home 
    for that data and is zero otherwise
    """
    home = []
    classes_length = len(classes)
    for i in range(classes_length): #loop over the classes and if 0 which means the cat1 is businesses, append 1. append 0 otherwise
        if classes[i] == 2:
            home.append(1)
            
        else:
            home.append(0)
            
    return home


In [184]:
def calculate_leisure_hobbies(classes):
    """
    This method returns a list which contains one as an element, whenever the cat1 is leisure-hobbies 
    for that data and is zero otherwise
    """
    hobbies = []
    classes_length = len(classes)
    for i in range(classes_length): #loop over the classes and if 0 which means the cat1 is businesses, append 1. append 0 otherwise
        if classes[i] == 3:
            hobbies.append(1)
            
        else:
            hobbies.append(0)
            
    return hobbies


In [185]:
def calculate_personal(classes):
    """
    This method returns a list which contains one as an element, whenever the cat1 is personal 
    for that data and is zero otherwise
    """
    personal = []
    classes_length = len(classes)
    for i in range(classes_length): #loop over the classes and if 0 which means the cat1 is businesses, append 1. append 0 otherwise
        if classes[i] == 4:
            personal.append(1)
            
        else:
            personal.append(0)
            
    return personal


In [186]:
def calculate_vehicles(classes):
    """
    This method returns a list which contains one as an element, whenever the cat1 is vehicles 
    for that data and is zero otherwise
    """
    vehicles = []
    classes_length = len(classes)
    for i in range(classes_length): #loop over the classes and if 0 which means the cat1 is businesses, append 1. append 0 otherwise
        if classes[i] == 5:
            vehicles.append(1)
            
        else:
            vehicles.append(0)
            
    return vehicles


In [187]:
# These are the desired outputs for the entire trainset
y0 = calculate_businesses(Y_train) #y0 is a list which is one when the cat1 is businesses and zero otherwise
y1 = calculate_electronic_devices(Y_train) #y1 is a list which is one when the cat1 is electronic-devices and zero otherwise
y2 = calculate_for_the_home(Y_train) #y2 is a list which is one when the cat1 is for-the-home and zero otherwise
y3 = calculate_leisure_hobbies(Y_train) #y3 is a list which is one when the cat1 is leisure-hobbies and zero otherwise
y4 = calculate_personal(Y_train) #y4 is a list which is one when the cat1 is personal and zero otherwise
y5 = calculate_vehicles(Y_train) #y5 is a list which is one when the cat1 is vehicles and zero otherwise

In [188]:
for i in range(5): #printing first 5 rows of classes and their respective values in y0 to y5
    print(Y_train[i]," " , y0[i], y1[i], y2[i], y3[i], y4[i], y5[i])

4   0 0 0 0 1 0
5   0 0 0 0 0 1
5   0 0 0 0 0 1
5   0 0 0 0 0 1
5   0 0 0 0 0 1


In [189]:
# Thes are the desired outputs only for the train part of the trainset in the 6 class of categories
#We need these values in order to train our model to predict the X values in the validation set
Y_train0 = calculate_businesses(Y_train) #o0 is a list which is one when the cat1 in the Y_train is businesses and zero otherwise
Y_train1 = calculate_electronic_devices(Y_train) #o1 is a list which is one when the cat1 in the Y_train is electronic-devices and zero otherwise
Y_train2 = calculate_for_the_home(Y_train) #o2 is a list which is one when the cat1 in the Y_train is for-the-home and zero otherwise
Y_train3 = calculate_leisure_hobbies(Y_train) #o3 is a list which is one when the cat1 in the Y_train is leisure-hobbies and zero otherwise
Y_train4 = calculate_personal(Y_train) #o4 is a list which is one when the cat1 in the Y_train is personal and zero otherwise
Y_train5 = calculate_vehicles(Y_train) #o5 is a list which is one when the cat1 in the Y_train is vehicles and zero otherwise

In [190]:
for i in range(5):  #printing first 5 rows of classes_of_Y_train and their respective values in Y_train0 to Y_train5
    print(Y_train[i]," " , Y_train0[i], Y_train1[i], Y_train2[i], Y_train3[i], Y_train4[i], Y_train5[i])

4   0 0 0 0 1 0
5   0 0 0 0 0 1
5   0 0 0 0 0 1
5   0 0 0 0 0 1
5   0 0 0 0 0 1


Implementing logistic regression: <br>
Logistic Regression is a Machine Learning classification algorithm that is used to predict the probability of a categorical dependent variable.

In [191]:
# Initializing logistic regression by determining the features and weights
feature = []
a = []
for i in range(len(title_train)):
    a.append(title_train[i])
    a.append(description_train[i])
    feature.append(a)    
    a = []

w0 = np.random.uniform(low = 0, high = 1) #initializing the wight0 with a random uniform real number
w1 = np.random.uniform(low = 0, high = 1) #initializing the wight1 with a random uniform real number
w2 = np.random.uniform(low = 0, high = 1) #initializing the wight2 with a random uniform real number

weights = np.array([w1, w2])
features = np.array(feature) #features is a 2d numpy array that has title_train as first column and description_train as the second 


# z = w0 + w1*title + w2*description

for i in range(5): #printing some features and their corresponding labels
    print(features[i])

print(features.shape)
print(weights.shape)
z = np.dot(features, weights)
print(len(z))

[2.15217283 0.41420206]
[2.19722451 0.57983615]
[2.65064345 1.8039019 ]
[1.02012679 1.30388006]
[2.1674174  0.50019512]
(800, 2)
(2,)
800


In [192]:
# Initializing logistic regression for businesses
label = []
for i in range(len(title_train)):
    label.append(Y_train0[i]) #businesses
actual_business_labels = np.array(label) #labels are the desires output or Y_train also an numpy array
print(actual_business_labels.shape)

(800,)


In [193]:
# Initializing logistic regression for electronic_devices
label = []
for i in range(len(title_train)):
    label.append(Y_train1[i]) #electronic_devices
actual_electronic_devices_labels = np.array(label) #labels are the desires output or Y_train also an numpy array
print(actual_electronic_devices_labels.shape)

(800,)


In [194]:
# Initializing logistic regression for for_the_home
label = []
for i in range(len(title_train)):
    label.append(Y_train2[i]) #for_the_home
actual_for_the_home_labels = np.array(label) #labels are the desires output or Y_train also an numpy array
print(actual_for_the_home_labels.shape)

(800,)


In [195]:
# Initializing logistic regression for leisure_hobbies
label = []
for i in range(len(title_train)):
    label.append(Y_train3[i]) #leisure_hobbies
actual_leisure_hobbies_labels = np.array(label) #labels are the desires output or Y_train also an numpy array
print(actual_leisure_hobbies_labels.shape)

(800,)


In [196]:
# Initializing logistic regression for personal
label = []
for i in range(len(title_train)):
    label.append(Y_train4[i]) #personal
actual_personal_labels = np.array(label) #labels are the desires output or Y_train also an numpy array
print(actual_personal_labels.shape)

(800,)


In [197]:
# Initializing logistic regression for vehicles
label = []
for i in range(len(title_train)):
    label.append(Y_train5[i]) #vehicles
actual_vehicles_labels = np.array(label) #labels are the desires output or Y_train also an numpy array
print(actual_vehicles_labels.shape)

(800,)


In [198]:
def sigmoid(z):
    """
    Sigmoid function to map calculated value to a probablity
    """   
    return 1 / (1 + np.exp(-z)) #sigmoid(z) = 1/(1 + e^(-z))

In [199]:
def predict(features, weights):
  '''
  Returns 1D array of probabilities
  that the class label == 1
  '''
  z = np.dot(features, weights)
  return sigmoid(z)

In [200]:
# print(predict(features, weights))
# print(np.log(predict(features, weights)))
# # print(np.log(1 - predict(features, weights)))
# print(labels)

In [292]:
def cost_function(features, labels, weights):
    '''
    Using Mean Absolute Error

    Features:(400,2)
    Labels: (400,1)
    Weights:(2,1)
    Returns 1D matrix of predictions
    Cost = (labels*log(predictions) + (1-labels)*log(1-predictions) ) / len(labels)
    '''
    observations = len(labels)

    predictions = predict(features, weights)

    #Take the error when label=1, If label=0, the first side cancels out.
    class1_cost = -labels*np.log(predictions) 

    #Take the error when label=0, If label=1, the second side cancels out. 
    class2_cost = (1-labels)*np.log(1-predictions) #a negative number

    #Take the sum of both costs
    cost = class1_cost - class2_cost #a positive number

    #Take the average cost
    cost = cost.sum() / observations

    return cost

In [293]:
def update_weights(features, labels, weights, lr):
    '''
    Vectorized Gradient Descent

    Features:(200, 3)
    Labels: (200, 1)
    Weights:(3, 1)
    '''
    N = len(features)

    #1 - Get Predictions
    predictions = predict(features, weights)

    #2 Transpose features from (200, 3) to (3, 200)
    # So we can multiply w the (200,1)  cost matrix.
    # Returns a (3,1) matrix holding 3 partial derivatives --
    # one for each feature -- representing the aggregate
    # slope of the cost function across all observations
    gradient = np.dot(features.T,  predictions - labels)

    #3 Take the average cost derivative for each feature
    gradient /= N

    #4 - Multiply the gradient by our learning rate
    gradient *= lr

    #5 - Subtract from our weights to minimize cost
    weights -= gradient

    return weights

In [294]:
def decision_boundary(prob):
    return 1 if prob >= .5 else 0 

In [295]:
def classify(predictions):
    '''
    input  - N element array of predictions between 0 and 1
    output - N element array of 0s (False) and 1s (True)
    '''
    predicted_labels = []
    for i in range(len(predictions)):
        predicted_labels.append(decision_boundary(predictions[i]))
    return predicted_labels


In [296]:
def train(features, labels, weights, learning_rate, iters):
    cost_history = []

    for i in range(iters):
        weights = update_weights(features, labels, weights, learning_rate)

        #Calculate error for auditing purposes
        cost = cost_function(features, labels, weights)
        cost_history.append(cost)

        # Log Progress
        if i % 1000 == 0:
            print ("iter: ",str(i) , " cost: ",str(cost))

    return weights, cost_history

In [297]:
def accuracy(predicted_labels, actual_labels):
    diff = predicted_labels - actual_labels
    return 1.0 - (float(np.count_nonzero(diff)) / len(diff))

In [207]:
# predictions = predict(features, weights)
# predicted_labels = classify(predictions) #labels predicted by the classifier
# print(predicted_labels)
# print(actual_business_labels)

TRAINING THE MODELS TO PREDICT EACH CATEGORY (y = w1 x title + w2 x description)

In [226]:
business_weights, business_cost_history = train(features, actual_business_labels, weights, 0.35, 10000)

iter:  0  cost:  0.3319393389962942
iter:  1000  cost:  0.29407478291248806
iter:  2000  cost:  0.29407478286842753
iter:  3000  cost:  0.2940747828684273
iter:  4000  cost:  0.2940747828684273
iter:  5000  cost:  0.2940747828684273
iter:  6000  cost:  0.2940747828684273
iter:  7000  cost:  0.2940747828684273
iter:  8000  cost:  0.2940747828684273
iter:  9000  cost:  0.2940747828684273


In [227]:
business_predictions = predict(features, weights)
business_predicted_labels = classify(business_predictions) #labels predicted by the classifier

In [228]:
electronic_weights, electronic_cost_history = train(features, actual_electronic_devices_labels, weights, 0.35, 10000)

iter:  0  cost:  0.5446307948517176
iter:  1000  cost:  0.49531366437078383
iter:  2000  cost:  0.4953136643707839
iter:  3000  cost:  0.4953136643707839
iter:  4000  cost:  0.4953136643707839
iter:  5000  cost:  0.4953136643707839
iter:  6000  cost:  0.4953136643707839
iter:  7000  cost:  0.4953136643707839
iter:  8000  cost:  0.4953136643707839
iter:  9000  cost:  0.4953136643707839


In [229]:
electronic_predictions = predict(features, weights)
electronic_predicted_labels = classify(electronic_predictions) #labels predicted by the classifier

In [230]:
for_the_home_weights, for_the_home_cost_history = train(features, actual_for_the_home_labels, weights, 0.35, 10000)

iter:  0  cost:  0.6395098071866425
iter:  1000  cost:  0.6071848906278104
iter:  2000  cost:  0.6071848906278104
iter:  3000  cost:  0.6071848906278104
iter:  4000  cost:  0.6071848906278104
iter:  5000  cost:  0.6071848906278104
iter:  6000  cost:  0.6071848906278104
iter:  7000  cost:  0.6071848906278104
iter:  8000  cost:  0.6071848906278104
iter:  9000  cost:  0.6071848906278104


In [231]:
for_the_home_predictions = predict(features, weights)
for_the_home_predicted_labels = classify(for_the_home_predictions) #labels predicted by the classifier

In [232]:
hobbies_weights, hobbies_cost_history = train(features, actual_leisure_hobbies_labels, weights, 0.35, 10000)

iter:  0  cost:  0.4473105031571346
iter:  1000  cost:  0.3665605555663041
iter:  2000  cost:  0.3665605492780547
iter:  3000  cost:  0.3665605492780526
iter:  4000  cost:  0.3665605492780526
iter:  5000  cost:  0.3665605492780526
iter:  6000  cost:  0.3665605492780526
iter:  7000  cost:  0.3665605492780526
iter:  8000  cost:  0.3665605492780526
iter:  9000  cost:  0.3665605492780526


In [233]:
hobbies_predictions = predict(features, weights)
hobbies_predicted_labels = classify(hobbies_predictions) #labels predicted by the classifier

In [234]:
personal_weights, personal_cost_history = train(features, actual_personal_labels, weights, 0.35, 10000)

iter:  0  cost:  0.5002052850420651
iter:  1000  cost:  0.45498903133205965
iter:  2000  cost:  0.4549890313320597
iter:  3000  cost:  0.4549890313320597
iter:  4000  cost:  0.4549890313320597
iter:  5000  cost:  0.4549890313320597
iter:  6000  cost:  0.4549890313320597
iter:  7000  cost:  0.4549890313320597
iter:  8000  cost:  0.4549890313320597
iter:  9000  cost:  0.4549890313320597


In [235]:
personal_predictions = predict(features, weights)
personal_predicted_labels = classify(personal_predictions) #labels predicted by the classifier

In [236]:
vehicle_weights, vehicle_cost_history = train(features, actual_vehicles_labels, weights, 0.35, 10000)

iter:  0  cost:  0.5731219626350369
iter:  1000  cost:  0.5548489591124812
iter:  2000  cost:  0.554848959112481
iter:  3000  cost:  0.554848959112481
iter:  4000  cost:  0.554848959112481
iter:  5000  cost:  0.554848959112481
iter:  6000  cost:  0.554848959112481
iter:  7000  cost:  0.554848959112481
iter:  8000  cost:  0.554848959112481
iter:  9000  cost:  0.554848959112481


In [237]:
vehicle_predictions = predict(features, weights)
vehicle_predicted_labels = classify(vehicle_predictions) #labels predicted by the classifier

Accuracies For Each Model

In [238]:
print("Business Model:",accuracy(business_predicted_labels, actual_business_labels),"\n",
      "Electronic Model:", accuracy(electronic_predicted_labels, actual_electronic_devices_labels),"\n",
      "For-the-home Model:", accuracy(for_the_home_predicted_labels, actual_for_the_home_labels),"\n",
      "Hobbies Model:", accuracy(hobbies_predicted_labels, actual_leisure_hobbies_labels),"\n",
      "Personal Model:", accuracy(personal_predicted_labels, actual_personal_labels),"\n",
      "Vehicle Model:", accuracy(vehicle_predicted_labels, actual_vehicles_labels))

Business Model: 0.9375 
 Electronic Model: 0.81125 
 For-the-home Model: 0.725 
 Hobbies Model: 0.90375 
 Personal Model: 0.8625 
 Vehicle Model: 0.76


In [239]:
def softmax(x):
    """Using softmax to assign a class to each text sample"""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

Assigning values to each text sample using predictions acquired for each model and softmax function

In [240]:
print(business_predictions[0])
print(electronic_predictions[0])
print(for_the_home_predictions[0])
print(hobbies_predictions[0])
print(personal_predictions[0])
print(vehicle_predictions[0])
first_prediction = [business_predictions[0], electronic_predictions[0], for_the_home_predictions[0], 
              hobbies_predictions[0], personal_predictions[0], vehicle_predictions[0]]
probs = softmax(first_prediction).tolist()
print(probs)
print(probs.index(max(probs)))
# print(softmax(first_prediction).sum())
def multiclass_predictor(business_predictions, electronic_predictions, for_the_home_predictions, 
                 hobbies_predictions, personal_predictions, vehicle_predictions):
    """
    In this method, given the predictions done by each binary classifier, we assign a class to each text sample
    """  
    predicted_categories = []
    for i in range(len(business_predictions)):
        prediction = [business_predictions[i], electronic_predictions[i], for_the_home_predictions[i], 
              hobbies_predictions[i], personal_predictions[i], vehicle_predictions[i]] #list containing predictions
        probs = softmax(prediction).tolist() # a list of 6 values which each element rpresents the probability for that class
        predicted_categories.append(probs.index(max(probs))) #assigning the maximum probability as category
        
    return predicted_categories
        

0.14959185475532819
0.2902197105806271
0.3471917070385001
0.2510124228975926
0.25158763926172684
0.36510075820391613
[0.14653979694767963, 0.16866675408535248, 0.17855503898943906, 0.16218174858754006, 0.1622750650192878, 0.18178159637070088]
5


In [241]:
final_predictions = multiclass_predictor(business_predictions, electronic_predictions, for_the_home_predictions, 
                 hobbies_predictions, personal_predictions, vehicle_predictions)

In [242]:
# Calculating the accuracy of the final model
final_predictions = np.array(final_predictions)
Y_train = np.array(Y_train)
final_model_accuracy = accuracy(final_predictions, Y_train)
print(final_model_accuracy)

0.30000000000000004


In [265]:
for i in range(10):
    print(Y_train[i], final_predictions[i])

4 5
5 2
5 2
5 2
5 2
4 5
1 5
4 5
2 2
3 5


In [243]:
def calculate_cross_entropy_loss_function(predicted_y, actual_y):
    """
    cross entropy loss function calculates the distance between predicted output, and true output helping us to update our 
    weights and threshold, to have better predictions.
    predicted_y : the predicted output
    actual_y : the actual output
    """   
    return -(actual_y * math.log(predicted_y) + (1 - actual_y) * math.log(1 - predicted_y))

In [244]:
def gradient_descent(sigmoid, numerical_X_train, actual_y):
    """
    gradient descent algorithm, helps updating the wights.
    sigmoid : the predicted y, more precisely, the probability of being in the class
    numerical_X_train : the input data
    actual_y : the actual output
    """  
    derivatives = []
    for i in range(len(numerical_X_train)):
        corresponding_input_value = numerical_X_train[i]
        derivatives.append((sigmoid - actual_y) * corresponding_input_value)
    return derivatives

In [245]:
def update_weights(w, b, X, Y, learning_rate, previous_cost):
    """
    this method is used to update the weights and the threshold 
    w : the list of weights
    b : the threshold
    X : the input data, comes from a sample text
    learning_rate : the rate in which we wish to change our weights and threshold
    previous_cost : the cost before the updating
    """   
    latest_w = w
    latest_b = b
    w_deriv = 0 #derivative of the weights
    b_deriv = 0 #derivative of the threshold
    N = len(X) #the length of the input data
    for i in range(N):
        # Calculate partial derivatives
        # -2x(y - (wx + b))
        w_deriv += -2*X[i] * (Y - (w[i]*X[i] + b))
        # We subtract because the derivatives point in direction of steepest ascent
        w[i] -= (w_deriv / float(N)) * learning_rate
        
    # -2(y - (wx + b))
    b_deriv += -2*(Y - (np.mean(w)*X[0] + b))
    b -= (b_deriv / float(N)) * learning_rate
    z = weighted_sum_of_the_evidence_for_class(X, w, b) #calculating the weighted sum of the evidence for class
    probability_of_being_in_class = sigmoid(z) #calculating the probability of being in that specific class
    new_cost = calculate_cross_entropy_loss_function(probability_of_being_in_class, Y) #calculating the cost
    if(new_cost > previous_cost):
        w, b = latest_w, latest_b
    return w, b

Calculating Scores

In [247]:
def calculate_true_positive(y_true, y_pred, category):
    """
    This method calculates the number of correct positive predictions for a specific class
    y_true : the list containing actual outputs
    y_pred : the list containing predicted outputs
    """  
    true_positives = 0
    length_of_data = len(y_true)
    #loop over the actual outputs and if the category in y_pred was also that specific class,
    #this means that the model correctly predicted positively
    for i in range(length_of_data): 
        if y_true[i] == category:
            if y_pred[i] == category:
                true_positives += 1 
    return true_positives

In [248]:
def calculate_true_negative(y_true, y_pred, category):
    """
    This method calculates the number of correct negative predictions for a specific class
    y_true : the list containing actual outputs
    y_pred : the list containing predicted outputs
    """ 
    true_negatives = 0
    length_of_data = len(y_true)
    #loop over the actual outputs and if the category was not that specific class, and also the y_pred was different
    #this means that the model correctly predicted negatively
    for i in range(length_of_data): 
        if y_true[i] != category:
            if y_pred[i] != category:
                true_negatives += 1 
    return true_negatives

In [249]:
def calculate_false_positive(y_true, y_pred, category):
    """
    This method calculates the number of false positive predictions for a specific class
    y_true : the list containing actual outputs
    y_pred : the list containing predicted outputs
    """ 
    false_positives = 0
    length_of_data = len(y_true)
    #loop over the predicted outputs; if the predicted output was category but the actual output (y_true) was not
    #this means that the model falsely predicted positively
    for i in range(length_of_data): 
        if y_pred[i] == category:
            if y_true[i] != category:
                false_positives += 1 
    return false_positives

In [250]:
def calculate_false_negative(y_true, y_pred, category):
    """
    This method calculates the number of false negative predictions for a specific class
    y_true : the list containing actual outputs
    y_pred : the list containing predicted outputs
    """ 
    false_negatives = 0
    length_of_data = len(y_true)
    #loop over the actual outputs and if the category was that specific class, and also the y_pred was different
    #this means that the model falsely predicted negatively
    for i in range(length_of_data): 
        if y_true[i] == category:
            if y_pred[i] != category:
                false_negatives += 1 
    return false_negatives

In [251]:
def calculate_accuracy(true_positive, true_negative, false_positive, false_negative):
    """
    This method calculates the accuracy of the model for a specific class
    true_positive : the samples that were positive and were predicted correctly
    true_negative : the samples that were negative and were predicted correctly
    false_positive : the samples that were negative and were falsely predicted as positive
    false_negative : the samples that were positive and were falsely predicted as negative
    """  
    accuracy = 0.0
    accuracy = round((true_positive + true_negative)/(true_positive + true_negative + false_positive + false_negative), 3)
    return accuracy

In [252]:
def calculate_percision(true_positive, false_positive):
    """
    This method calculates the percision of the model for a specific class
    true_positive : the samples that were positive and were predicted correctly
    false_positive : the samples that were negative and were falsely predicted as positive
    """  
    percision = 0.0
    percision = round(true_positive/(true_positive + false_positive) , 3)
    return percision

In [253]:
def calculate_recall(true_positive, false_negative):
    """
    This method calculates the recall of the model for a specific class
    true_positive : the samples that were positive and were predicted correctly
    false_negative : the samples that were positive and were falsely predicted as negative
    """  
    recall = 0.0
    recall = round(true_positive/(true_positive + false_negative) , 3)
    return recall

In [254]:
def calculate_f1_score(recall, percision):
    """
    This method calculates the f1 score of the model for a specific class
    percision : the percision of the model for that class
    recall : the recall of the model for that class
    """  
    f1_score = 0.0
    f1_score = round((2 * recall * percision)/(recall + percision) , 3)
    return f1_score

In [255]:
def calculate_support(y_true, category):
    """
    This method calculates the number of samples belong to a specific category in the y_true
    y_true : the list containing actual outputs
    category : the category that we are counting the number of
    """  
    count = 0
    length_of_data = len(y_true)
    for i in range(length_of_data):
        if y_true[i] == category:
            count += 1
    return count

In [256]:
def calculate_macro_average(f1_arr, percision_arr, recall_arr):
    """
    An arithmetic mean of per-class F1, percision, and recall
    f1_arr : a list containing f1_scores per class
    percision_arr : a list containing percision per class
    recall_arr : a list containing recall per class
    """ 
    macro_averaged_f1 = round(np.mean(f1_arr), 3)
    macro_averaged_percision = round(np.mean(percision_arr), 3)
    macro_averaged_recall = round(np.mean(recall_arr), 3)
    return macro_averaged_f1, macro_averaged_percision, macro_averaged_recall 

In [257]:
def calculate_weighted_average(f1_arr, percision_arr, recall_arr, y_true):
    """
    An arithmetic mean of per-class F1, percision, and recall, but also weights the score of each class by 
    the number of samplesfrom that class. This method works well based on the assumption that the f1_arr, 
    percision_arr, and recall_arr are given based on the alphabetic sort of their categories.
    f1_arr : a list containing f1_scores per class
    percision_arr : a list containing percision per class
    recall_arr : a list containing recall per class
    y_true : the list containing actual outputs
    """ 
    samples_length = len(y_true)
    counts = [] #a list containing the number of samples in each category
    myset = set(y_true) #converting to set
    categories = list(myset) #categories
    sorted_categories = sorted(categories)
#     print(sorted_categories)
    
    for i in range(len(categories)):
        counts.append(y_true.count(sorted_categories[i]))
#     print(counts)
    f1 = 0
    prec = 0
    rec = 0
#     for i in range(len(f1_arr)):
#         print(f1_arr[i], " ", counts[i])
    for i in range(len(f1_arr)):
        f1 += (f1_arr[i] * counts[i])  
        prec += (percision_arr[i] * counts[i])
        rec += (recall_arr[i] * counts[i])
    weighted_f1 = round(f1/samples_length, 3)  
    weighted_percision = round(prec/samples_length, 3)
    weighted_recall = round(rec/samples_length, 3)
    
    return weighted_f1, weighted_percision, weighted_recall 

In [258]:
def calculate_micro_average(confusion_matrix):
    """
    The following always holds true for the micro-F1 case:
    micro-F1 = micro_percision = micro_recall = accuracy
    We first calculate micro_percision and micro_recall and then combine the two
    f1_arr : a list containing f1_scores per class
    percision_arr : a list containing percision per class
    recall_arr : a list containing recall per class
    y_true : the list containing actual outputs
    """ 
    #in multi-class, all the correctly predicted samples are true positives.
    #gettin the diagonal
    true_positives = 0
    j = 0
    for i in range(len(confusion_matrix)):
        true_positives += confusion_matrix[i][j]
        j += 1
    
#     print(true_positives)
    #each prediction error is a false positive for the class that we predicted
    #again, the total number of false negatives is the total number of prediction errors
    #false_negatives = false_positives
    false_positives = 0
    for i in range(len(confusion_matrix)): #looping over the confusion matrix
        for j in range(len(confusion_matrix)):
            if i != j:  #passing the diagonal
                false_positives += confusion_matrix[i][j]
#     print(false_positives)    
    accuracy = true_positives/(true_positives + false_positives)  
    return round(accuracy, 3)
        

In [267]:
from sklearn import metrics
# Constants
B = "businesses"
E = "electronic-devices"
F = "for-the-home"
L = "leisure-hobbies"
P = "personal"
V = "vehicles"

# True values
y_true = [B, E, B, E, F, L, V, V, V, P, V, B, E, F, L, P, V]
# Predicted values
y_pred = [E, B, F, L, L, L, V, P, P, B, V, B, E, F, L, P, V]

# Print the confusion matrix
print(metrics.confusion_matrix(y_true, y_pred))

# Print the precision and recall, among other metrics
print(metrics.classification_report(y_true, y_pred, digits=3))

[[1 1 1 0 0 0]
 [1 1 0 1 0 0]
 [0 0 1 1 0 0]
 [0 0 0 2 0 0]
 [1 0 0 0 1 0]
 [0 0 0 0 2 3]]
                    precision    recall  f1-score   support

        businesses      0.333     0.333     0.333         3
electronic-devices      0.500     0.333     0.400         3
      for-the-home      0.500     0.500     0.500         2
   leisure-hobbies      0.500     1.000     0.667         2
          personal      0.333     0.500     0.400         2
          vehicles      1.000     0.600     0.750         5

          accuracy                          0.529        17
         macro avg      0.528     0.544     0.508        17
      weighted avg      0.598     0.529     0.534        17



In [260]:
from sklearn import metrics
# Constants
C="Cat"
F="Fish"
H="Hen"

# True values
y_true = [C,C,C,C,C,C, F,F,F,F,F,F,F,F,F,F, H,H,H,H,H,H,H,H,H]
# Predicted values
y_pred = [C,C,C,C,H,F, C,C,C,C,C,C,H,H,F,F, C,C,C,H,H,H,H,H,H]

# Print the confusion matrix
confusion_matrix = metrics.confusion_matrix(y_true, y_pred)
print(confusion_matrix)

# Print the precision and recall, among other metrics
print(metrics.classification_report(y_true, y_pred, digits=3))

[[4 1 1]
 [6 2 2]
 [3 0 6]]
              precision    recall  f1-score   support

         Cat      0.308     0.667     0.421         6
        Fish      0.667     0.200     0.308        10
         Hen      0.667     0.667     0.667         9

    accuracy                          0.480        25
   macro avg      0.547     0.511     0.465        25
weighted avg      0.581     0.480     0.464        25



In [261]:
#based on the category of the previous four calculations for TP, TN, FP, FN, 
# the acuracy, percision, recall and f1 score will be calculated for the same category
true_positive = calculate_true_positive(y_true, y_pred, C) 
true_negative = calculate_true_negative(y_true, y_pred, C)
false_positive = calculate_false_positive(y_true, y_pred, C)
false_negative = calculate_false_negative(y_true, y_pred, C)
accuracy_for_cat = calculate_accuracy(true_positive, true_negative, false_positive, false_negative)
percision_for_cat = calculate_percision(true_positive, false_positive)
recall_for_cat = calculate_recall(true_positive, false_negative)
f1_score_for_cat = calculate_f1_score(recall_for_cat, percision_for_cat)

In [262]:
print(true_positive, " ", true_negative, " ", false_positive, " ", false_negative) 
print(accuracy_for_cat, " ", percision_for_cat, " ", recall_for_cat, " ", f1_score_for_cat)

4   10   9   2
0.56   0.308   0.667   0.421


In [268]:
f1 = [0.421, 0.308, 0.667]
percision = [0.308, 0.667, 0.667]
recall = [0.667, 0.200, 0.667]
macro_f1, macro_percision, macro_recall = calculate_macro_average(f1, percision, recall)
weighted_f1, weighted_percision, weighted_recall = calculate_weighted_average(f1, percision, recall, y_true)
accuracy =  calculate_micro_average(confusion_matrix)

In [269]:
f1 = [0.333, 0.400, 0.500, 0.667, 0.400, 0.750]
percision = [0.333, 0.500, 0.500, 0.500, 0.333, 1.000]
recall = [0.333, 0.333, 0.500, 1.000, 0.500, 0.600]
macro_f1, macro_percision, macro_recall = calculate_macro_average(f1, percision, recall)
weighted_f1, weighted_percision, weighted_recall = calculate_weighted_average(f1, percision, recall, y_true)
accuracy =  calculate_micro_average(confusion_matrix)

In [270]:
print(macro_f1, " ", macro_percision, " ", macro_recall) 

0.508   0.528   0.544


In [271]:
print(weighted_f1, " ", weighted_percision, " ", weighted_recall)

0.534   0.598   0.529


In [272]:
print(accuracy)

0.48


the average methods need fixing to do the job for other dataframes.
the confusion matrix function should be implemented

In [273]:
from sklearn import metrics


# Print the confusion matrix
print(metrics.confusion_matrix(Y_train, final_predictions))

# Print the precision and recall, among other metrics
print(metrics.classification_report(Y_train, final_predictions, digits=3))

[[  0   0  34   0   0  16]
 [  0   0  99   0   0  52]
 [  0   0 160   0   0  60]
 [  0   0  45   0   0  32]
 [  0   0  77   0   0  33]
 [  0   0 112   0   0  80]]
              precision    recall  f1-score   support

           0      0.000     0.000     0.000        50
           1      0.000     0.000     0.000       151
           2      0.304     0.727     0.428       220
           3      0.000     0.000     0.000        77
           4      0.000     0.000     0.000       110
           5      0.293     0.417     0.344       192

    accuracy                          0.300       800
   macro avg      0.099     0.191     0.129       800
weighted avg      0.154     0.300     0.200       800



  'precision', 'predicted', average, warn_for)


In [278]:
# implementation of confusion matrix
import pandas as pd
y_actu = pd.Series(Y_train, name='Actual')
y_pred = pd.Series(final_predictions, name='Predicted')
df_confusion = pd.crosstab(y_actu, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

In [279]:
df_confusion

Predicted,2,5,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,34,16,50
1,99,52,151
2,160,60,220
3,45,32,77
4,77,33,110
5,112,80,192
All,527,273,800


In [298]:
# calcularing the accuracy for validation set
descriptions = validationset['desc']
titles = validationset['title']
cleared_descriptions = preprocessing(descriptions.head(1000)) #preprocessed descriptions
cleared_titles = preprocessing(titles.head(1000)) #preprocessed titles
#the following are all lists
#A list containing numerical title and description for each text sample in the training set
titles, descriptions = get_numerical_features(non_numerical_title, non_numerical_description) 
Y = categorizer(non_numerical_Y) #A list containing an integer for the category in cat1
title_train, title_test, description_train, description_test, Y_train, Y_test = train_test_split(
        titles, descriptions, Y, test_size=0.2, random_state=42)

In [299]:
# Thes are the desired outputs only for the train part of the trainset in the 6 class of categories
#We need these values in order to train our model to predict the X values in the validation set
Y_train0 = calculate_businesses(Y_train) #o0 is a list which is one when the cat1 in the Y_train is businesses and zero otherwise
Y_train1 = calculate_electronic_devices(Y_train) #o1 is a list which is one when the cat1 in the Y_train is electronic-devices and zero otherwise
Y_train2 = calculate_for_the_home(Y_train) #o2 is a list which is one when the cat1 in the Y_train is for-the-home and zero otherwise
Y_train3 = calculate_leisure_hobbies(Y_train) #o3 is a list which is one when the cat1 in the Y_train is leisure-hobbies and zero otherwise
Y_train4 = calculate_personal(Y_train) #o4 is a list which is one when the cat1 in the Y_train is personal and zero otherwise
Y_train5 = calculate_vehicles(Y_train) #o5 is a list which is one when the cat1 in the Y_train is vehicles and zero otherwise

In [300]:
# Initializing logistic regression by determining the features and weights
feature = []
a = []
for i in range(len(title_train)):
    a.append(title_train[i])
    a.append(description_train[i])
    feature.append(a)    
    a = []

w0 = np.random.uniform(low = 0, high = 1) #initializing the wight0 with a random uniform real number
w1 = np.random.uniform(low = 0, high = 1) #initializing the wight1 with a random uniform real number
w2 = np.random.uniform(low = 0, high = 1) #initializing the wight2 with a random uniform real number

weights = np.array([w1, w2])
features = np.array(feature) #features is a 2d numpy array that has title_train as first column and description_train as the second 


# z = w0 + w1*title + w2*description

for i in range(5): #printing some features and their corresponding labels
    print(features[i])

print(features.shape)
print(weights.shape)
z = np.dot(features, weights)
print(len(z))

[2.2831345  0.42813806]
[2.20611815 0.575463  ]
[2.81329034 1.97718869]
[1.00838878 1.42689962]
[2.31509918 0.53565907]
(800, 2)
(2,)
800


TRAINING THE MODELS TO PREDICT EACH CATEGORY (y = w1 x title + w2 x description)

In [303]:
# Initializing logistic regression for businesses
label = []
for i in range(len(title_train)):
    label.append(Y_train0[i]) #businesses
actual_business_labels = np.array(label) #labels are the desires output or Y_train also an numpy array
print(actual_business_labels.shape)

# Initializing logistic regression for electronic_devices
label = []
for i in range(len(title_train)):
    label.append(Y_train1[i]) #electronic_devices
actual_electronic_devices_labels = np.array(label) #labels are the desires output or Y_train also an numpy array
print(actual_electronic_devices_labels.shape)

# Initializing logistic regression for for_the_home
label = []
for i in range(len(title_train)):
    label.append(Y_train2[i]) #for_the_home
actual_for_the_home_labels = np.array(label) #labels are the desires output or Y_train also an numpy array
print(actual_for_the_home_labels.shape)

# Initializing logistic regression for leisure_hobbies
label = []
for i in range(len(title_train)):
    label.append(Y_train3[i]) #leisure_hobbies
actual_leisure_hobbies_labels = np.array(label) #labels are the desires output or Y_train also an numpy array
print(actual_leisure_hobbies_labels.shape)

# Initializing logistic regression for personal
label = []
for i in range(len(title_train)):
    label.append(Y_train4[i]) #personal
actual_personal_labels = np.array(label) #labels are the desires output or Y_train also an numpy array
print(actual_personal_labels.shape)

# Initializing logistic regression for vehicles
label = []
for i in range(len(title_train)):
    label.append(Y_train5[i]) #vehicles
actual_vehicles_labels = np.array(label) #labels are the desires output or Y_train also an numpy array
print(actual_vehicles_labels.shape)


(800,)
(800,)
(800,)
(800,)
(800,)
(800,)


In [304]:
business_weights, business_cost_history = train(features, actual_business_labels, weights, 0.35, 10000)
business_predictions = predict(features, weights)
business_predicted_labels = classify(business_predictions) #labels predicted by the classifier

iter:  0  cost:  0.2924488354336976
iter:  1000  cost:  0.2924488354336976
iter:  2000  cost:  0.2924488354336976
iter:  3000  cost:  0.2924488354336976
iter:  4000  cost:  0.2924488354336976
iter:  5000  cost:  0.2924488354336976
iter:  6000  cost:  0.2924488354336976
iter:  7000  cost:  0.2924488354336976
iter:  8000  cost:  0.2924488354336976
iter:  9000  cost:  0.2924488354336976


In [305]:
electronic_weights, electronic_cost_history = train(features, actual_electronic_devices_labels, weights, 0.35, 10000)
electronic_predictions = predict(features, weights)
electronic_predicted_labels = classify(electronic_predictions) #labels predicted by the classifier

iter:  0  cost:  0.547466200979262
iter:  1000  cost:  0.49741998271086735
iter:  2000  cost:  0.4974199827108673
iter:  3000  cost:  0.4974199827108673
iter:  4000  cost:  0.4974199827108673
iter:  5000  cost:  0.4974199827108673
iter:  6000  cost:  0.4974199827108673
iter:  7000  cost:  0.4974199827108673
iter:  8000  cost:  0.4974199827108673
iter:  9000  cost:  0.4974199827108673


In [306]:
for_the_home_weights, for_the_home_cost_history = train(features, actual_for_the_home_labels, weights, 0.35, 10000)
for_the_home_predictions = predict(features, weights)
for_the_home_predicted_labels = classify(for_the_home_predictions) #labels predicted by the classifier

iter:  0  cost:  0.6406565983344934
iter:  1000  cost:  0.6081959318058575
iter:  2000  cost:  0.6081959318058575
iter:  3000  cost:  0.6081959318058575
iter:  4000  cost:  0.6081959318058575
iter:  5000  cost:  0.6081959318058575
iter:  6000  cost:  0.6081959318058575
iter:  7000  cost:  0.6081959318058575
iter:  8000  cost:  0.6081959318058575
iter:  9000  cost:  0.6081959318058575


In [307]:
hobbies_weights, hobbies_cost_history = train(features, actual_leisure_hobbies_labels, weights, 0.35, 10000)
hobbies_predictions = predict(features, weights)
hobbies_predicted_labels = classify(hobbies_predictions) #labels predicted by the classifier

iter:  0  cost:  0.44494496413397394
iter:  1000  cost:  0.3662833083811688
iter:  2000  cost:  0.3662833066838577
iter:  3000  cost:  0.36628330668385756
iter:  4000  cost:  0.36628330668385756
iter:  5000  cost:  0.36628330668385756
iter:  6000  cost:  0.36628330668385756
iter:  7000  cost:  0.36628330668385756
iter:  8000  cost:  0.36628330668385756
iter:  9000  cost:  0.36628330668385756


In [308]:
personal_weights, personal_cost_history = train(features, actual_personal_labels, weights, 0.35, 10000)
personal_predictions = predict(features, weights)
personal_predicted_labels = classify(personal_predictions) #labels predicted by the classifier

iter:  0  cost:  0.5010844695611638
iter:  1000  cost:  0.4522056238110062
iter:  2000  cost:  0.4522056238110062
iter:  3000  cost:  0.4522056238110062
iter:  4000  cost:  0.4522056238110062
iter:  5000  cost:  0.4522056238110062
iter:  6000  cost:  0.4522056238110062
iter:  7000  cost:  0.4522056238110062
iter:  8000  cost:  0.4522056238110062
iter:  9000  cost:  0.4522056238110062


In [309]:
vehicle_weights, vehicle_cost_history = train(features, actual_vehicles_labels, weights, 0.35, 10000)
vehicle_predictions = predict(features, weights)
vehicle_predicted_labels = classify(vehicle_predictions) #labels predicted by the classifier

iter:  0  cost:  0.5745624667822271
iter:  1000  cost:  0.5547552348243195
iter:  2000  cost:  0.5547552348243195
iter:  3000  cost:  0.5547552348243195
iter:  4000  cost:  0.5547552348243195
iter:  5000  cost:  0.5547552348243195
iter:  6000  cost:  0.5547552348243195
iter:  7000  cost:  0.5547552348243195
iter:  8000  cost:  0.5547552348243195
iter:  9000  cost:  0.5547552348243195


In [310]:
print("Business Model:",accuracy(business_predicted_labels, actual_business_labels),"\n",
      "Electronic Model:", accuracy(electronic_predicted_labels, actual_electronic_devices_labels),"\n",
      "For-the-home Model:", accuracy(for_the_home_predicted_labels, actual_for_the_home_labels),"\n",
      "Hobbies Model:", accuracy(hobbies_predicted_labels, actual_leisure_hobbies_labels),"\n",
      "Personal Model:", accuracy(personal_predicted_labels, actual_personal_labels),"\n",
      "Vehicle Model:", accuracy(vehicle_predicted_labels, actual_vehicles_labels))

Business Model: 0.9375 
 Electronic Model: 0.81125 
 For-the-home Model: 0.725 
 Hobbies Model: 0.90375 
 Personal Model: 0.8625 
 Vehicle Model: 0.76


In [312]:
# multiclass prediction
final_predictions = multiclass_predictor(business_predictions, electronic_predictions, for_the_home_predictions, 
                 hobbies_predictions, personal_predictions, vehicle_predictions)

# Calculating the accuracy of the final model
final_predictions = np.array(final_predictions)
Y_train = np.array(Y_train)
final_model_accuracy = accuracy(final_predictions, Y_train)
print(final_model_accuracy)
for i in range(10):
    print(Y_train[i], final_predictions[i])

0.29500000000000004
4 5
5 2
5 2
5 2
5 5
4 5
1 5
4 5
2 2
3 5
