## Classification


In [1]:
# Loading the Libraries and Data
import pandas as pd
pd.set_option('display.max_colwidth', 500)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import _pickle as pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from tqdm import tqdm_notebook as tqdm

# Reading the original, unclustered profiles
# NOTE(review): unpickling can execute code embedded in the file, so these
# .pkl files must come from a trusted source.
with open("numuser_data.pkl", "rb") as raw_file:
    raw_df = pickle.load(raw_file)

# Reading the profiles that already carry a cluster label
with open("clustered_profiles.pkl", "rb") as clustered_file:
    cluster_df = pickle.load(clustered_file)

In [2]:
# Starting from an empty frame that shares the raw data's columns
new_profile = pd.DataFrame(columns=raw_df.columns)

# Every column after the first gets one random integer from 0 to 9 as a
# placeholder (presumably the first column is 'Bios', which is filled in below)
placeholder_columns = new_profile.columns[1:]
for column in placeholder_columns:
    new_profile[column] = np.random.randint(0, 10, 1)

# Showing the prompt text together with a sample bio
print("Enter new profile information...\n\nExample Bio:\nBacon enthusiast. Falls down a lot. Freelance social media fan. Infuriatingly humble introvert.")

# Reading the bio text typed in by the user
new_profile['Bios'] = input("Enter bio: ")

# Labelling the new row with the index that follows the last raw profile
next_index = raw_df.index[-1] + 1
new_profile.index = [next_index]

Enter new profile information...

Example Bio:
Bacon enthusiast. Falls down a lot. Freelance social media fan. Infuriatingly humble introvert.
Enter bio: food lover. dog lover


In [3]:
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

## Vectorizing and Scaling

In [4]:
# Assigning the split variables: X holds every column except the cluster
# label and y holds the cluster label itself.
# `columns=` is used because pandas 2.0 no longer accepts the axis as a
# positional argument to drop().
X = cluster_df.drop(columns=["Cluster #"])
y = cluster_df['Cluster #']

In [5]:
## Vectorizing
# Instantiating the Vectorizer
vectorizer = CountVectorizer()

# Learning the vocabulary from the Bios and counting each word per profile
x = vectorizer.fit_transform(X['Bios'])

# scikit-learn 1.2 removed get_feature_names(); prefer its replacement and
# fall back only on releases that predate it
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()
)

# Word-count DF that shares X's index, so the column-wise concat below pairs
# every profile with its own counts even when the index is not 0..n-1
df_wrds = pd.DataFrame(x.toarray(), index=X.index, columns=feature_names)

# Concating the words DF with the original DF and dropping the raw Bios text,
# which the vectorized columns now replace
X = pd.concat([X, df_wrds], axis=1).drop(columns=['Bios'])

In [6]:
## Scaling the Data
# NOTE(review): the scaler is fitted on the full feature matrix, ahead of the
# train/test split performed later, so held-out rows influence the scaling.
scaler = MinMaxScaler()

# Fitting and rescaling in one step, then restoring the row and column labels
scaled_values = scaler.fit_transform(X)
X = pd.DataFrame(scaled_values, index=X.index, columns=X.columns)

In [7]:
# Vectorizing the new bio with the already-fitted vectorizer, so it is encoded
# with the same vocabulary as the existing profiles
vect_new_prof = vectorizer.transform(new_profile['Bios'])

# scikit-learn 1.2 removed get_feature_names(); prefer its replacement and
# fall back only on releases that predate it
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()
)

# Quick DF of the vectorized words, indexed like the new profile
new_vect_w = pd.DataFrame(vect_new_prof.toarray(), columns=feature_names, index=new_profile.index)

# Concatenating the DFs for the new profile data and dropping the raw text.
# axis/columns are passed by keyword because pandas 2.0 rejects them as
# positional arguments.
new_unscaled_prof = pd.concat([new_profile, new_vect_w], axis=1).drop(columns='Bios')

# Scaling the new profile data with the already-fitted scaler
new_vect_prof = pd.DataFrame(
    scaler.transform(new_unscaled_prof),
    columns=new_unscaled_prof.columns,
    index=new_unscaled_prof.index,
)


## Modelling the User Profiles

In [8]:
# Fixed seed so the split and the random baseline give the same scores on
# every run of the notebook
SEED = 42

# Train, test, split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=SEED)

# Dummy (random baseline; seeded because the 'stratified' strategy samples
# its predictions)
dummy = DummyClassifier(strategy='stratified', random_state=SEED)

# KNN
knn = KNeighborsClassifier()

# SVM
svm = SVC()

# List of models
models = [dummy, knn, svm]

# List of model names
names = ['Dummy', 'KNN', 'SVM']

# Zipping the lists into a name -> estimator mapping
classifiers = dict(zip(names, models))

# Dictionary containing the model names and their scores
models_f1 = {}

# Fitting each model on the training split and scoring it on the test split
for name, model in classifiers.items():
    # Fitting the model
    model.fit(X_train, y_train)

    print('\n'+ name + ' (Macro Avg - F1 Score):')

    # Classification Report as a dict, from which the macro-averaged F1 is read
    report = classification_report(y_test, model.predict(X_test), output_dict=True)
    f1 = report['macro avg']['f1-score']

    # Assigning to the Dictionary
    models_f1[name] = f1

    print(f1)

# Printing out the best performing model (on a tie, the first name in
# insertion order is reported)
best_name = max(models_f1, key=models_f1.get)
print(best_name, 'Score:', models_f1[best_name])


Dummy (Macro Avg - F1 Score):
0.09036562731882401

KNN (Macro Avg - F1 Score):
1.0

SVM (Macro Avg - F1 Score):
1.0
KNN Score: 1.0


## Using the SVM Classifier on New Data

In [9]:
# Refitting the SVM on the complete feature matrix and labels
svm.fit(X, y)

# Predicting which cluster the new profile belongs to
designated_cluster = svm.predict(new_vect_prof)

# Keeping only the rows of cluster_df that carry the predicted cluster label
in_designated_cluster = cluster_df['Cluster #'] == designated_cluster[0]
des_cluster = cluster_df[in_designated_cluster]

In [None]:
# Displaying the clustered profiles DF for inspection
cluster_df

Unnamed: 0,Bios,Location,Interested in,Age,Movies,Music,Sports,Politics,Social Media,Cluster #
0,Social media geek. Freelance alcohol aficionado. Professional internet specialist. Avid writer.,4.0,0.0,2.0,9.0,5.0,5.0,0.0,8.0,7
1,Explorer. Incurable social media evangelist. Problem solver. Freelance pop culture junkie.,9.0,1.0,4.0,9.0,2.0,0.0,9.0,5.0,3
2,Friendly social media nerd. Beer guru. Future teen idol. Pop culture fanatic. Zombie enthusiast.,4.0,3.0,8.0,9.0,2.0,8.0,2.0,9.0,3
3,Beer enthusiast. Creator. Friendly bacon ninja. Extreme gamer. Total student. Wannabe travel geek. Unapologetic tv aficionado.,1.0,8.0,2.0,7.0,1.0,9.0,8.0,8.0,8
4,Lifelong communicator. Pop culture fanatic. Web fanatic. Proud creator. Reader. Certified writer.,2.0,5.0,1.0,1.0,5.0,5.0,0.0,1.0,2
...,...,...,...,...,...,...,...,...,...,...
4735,Total entrepreneur. Internet buff. Freelance beer ninja. Amateur student. Twitter maven. Tv specialist. General travel advocate. Coffee practitioner.,5.0,0.0,3.0,0.0,1.0,1.0,9.0,4.0,0
4736,Award-winning organizer. Future teen idol. Subtly charming creator. Music maven. Typical food junkie. Falls down a lot. Alcohol specialist.,0.0,1.0,6.0,2.0,3.0,6.0,2.0,3.0,8
4737,Entrepreneur. Coffee fanatic. Bacon nerd. Communicator. Prone to fits of apathy. Unapologetic zombie guru. Travel scholar. Writer.,6.0,7.0,5.0,4.0,5.0,7.0,7.0,8.0,6
4738,Certified explorer. Travel trailblazer. Friend of animals everywhere. Incurable alcohol enthusiast.,0.0,7.0,7.0,4.0,3.0,2.0,1.0,6.0,10


## Finding Top 10 Similar Profiles

In [10]:
# Appending the new profile data as an extra row; pd.concat is used because
# DataFrame.append was removed in pandas 2.0
des_cluster = pd.concat([des_cluster, new_profile], sort=False)

# Fitting a separate vectorizer to this cluster's Bios, so the global
# `vectorizer` keeps the vocabulary that the scaler and classifier were
# trained with
cluster_vectorizer = CountVectorizer()
cluster_x = cluster_vectorizer.fit_transform(des_cluster['Bios'])

# scikit-learn 1.2 removed get_feature_names(); prefer its replacement and
# fall back only on releases that predate it
cluster_feature_names = (
    cluster_vectorizer.get_feature_names_out()
    if hasattr(cluster_vectorizer, 'get_feature_names_out')
    else cluster_vectorizer.get_feature_names()
)

# Creating a new DF that contains the vectorized words
cluster_v = pd.DataFrame(cluster_x.toarray(), index=des_cluster.index, columns=cluster_feature_names)

# Joining the Vectorized DF to the previous DF and dropping columns
des_cluster = des_cluster.join(cluster_v).drop(['Bios', 'Cluster #'], axis=1)


## Correlations
# Transposing the DF so that users become the columns, then correlating the
# users with each other
corr = des_cluster.T.corr()

# Index label of the new user
user_n = new_profile.index[0]

# Creating a DF with the Top 10 most similar profiles. The new user's own row
# is dropped by label instead of assuming it sorts first.
top_10_sim = (
    corr[[user_n]]
    .drop(index=user_n)
    .sort_values(by=user_n, axis=0, ascending=False)
    .head(10)
)

# Finally locating the Top 10 profiles
raw_df.loc[top_10_sim.index]

Unnamed: 0,Bios,Location,Interested in,Age,Movies,Music,Sports,Politics,Social Media
3751,Web expert. Explorer. Amateur internet guru. Subtly charming thinker. Zombie scholar. Avid bacon nerd. Communicator.,6,6,7,8,4,7,4,9
3185,Freelance zombie specialist. Twitter maven. Web scholar. Evil beer ninja. Food fanatic. Gamer. Hardcore music buff. Travel aficionado.,5,8,6,9,6,8,4,7
1797,Professional entrepreneur. Unapologetic communicator. Hardcore beer aficionado. Devoted student.,7,5,7,7,5,6,5,9
123,Beer enthusiast. Creator. Friendly bacon ninja. Extreme gamer. Total student. Wannabe travel geek. Unapologetic tv aficionado.,5,7,4,8,5,6,7,7
546,Communicator. Extreme writer. Tv nerd. Wannabe organizer. Amateur music maven. Analyst.,4,9,9,8,5,6,5,9
4230,Freelance travel ninja. Introvert. Gamer. Total creator. Tv geek. Thinker. Webaholic. Unapologetic alcohol fan.,5,6,2,6,6,8,5,7
1243,Coffee practitioner. Subtly charming alcohol scholar. Award-winning introvert. Internetaholic.,6,6,4,4,2,7,5,8
127,Coffee aficionado. Proud student. Explorer. Wannabe alcoholaholic. Devoted communicator.,8,7,9,9,4,7,6,5
3187,Analyst. Zombie lover. Devoted coffee junkie. Typical travel advocate. Food buff.,3,7,2,5,4,9,6,9
591,Communicator. Extreme writer. Tv nerd. Wannabe organizer. Amateur music maven. Analyst.,5,4,4,4,5,6,4,5


In [12]:
import joblib

# Saving the classifier to disk. `svm` is named explicitly rather than
# `model`, the variable left over from the evaluation loop, so the saved
# estimator does not depend on the order of the models list.
filename = 'model.joblib'
joblib.dump(svm, filename)

['model.joblib']