# Adding a New Dating Profile
Using Classification or Clustering for a New Dating Profile

### Importing Libraries and Data

In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', 500)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import _pickle as pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import calinski_harabasz_score, silhouette_score, davies_bouldin_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from tqdm import tqdm_notebook as tqdm

#### Loading the Profiles

In [3]:
# Loading in the cleaned DF
with open("mechanic_profiles.pkl",'rb') as fp:
    raw_df = pickle.load(fp)

# Viewing the DF    
raw_df.head()

Unnamed: 0,Bios,Loation,Qualification,Specialisation
0,Passionate analyst. Explorer. Hipster-friendly problem solver. Freelance music geek. Social media advocate. Reader.,5,3,1
1,Twitter fanatic. Devoted web fanatic. Zombie evangelist. Travel aficionado. Bacon lover.,5,7,3
2,Total alcohol practitioner. Social media buff. Evil beer expert. Devoted analyst. Problem solver. Student.,2,0,2
3,Extreme twitter advocate. Hardcore internet junkie. Entrepreneur. Friend of animals everywhere.,4,6,6
4,Problem solver. Devoted introvert. Food geek. Avid writer. Thinker. Troublemaker. Friend of animals everywhere.,7,2,8


#### Loading the Clustered Profiles

In [4]:
# Loading in the clustered DF
with open("clustered_profiles.pkl",'rb') as fp:
    cluster_df = pickle.load(fp)

# Viewing the DF    
cluster_df.tail()

Unnamed: 0,Bios,Loation,Qualification,Specialisation,Cluster #
6160,Lifelong zombie junkie. Friendly travel buff. Coffeeaholic. Internet enthusiast.,7.0,5.0,3.0,1
6161,Total introvert. Tv specialist. Pop culture ninja. Web lover. Subtly charming twitter advocate. Explorer.,3.0,1.0,8.0,1
6162,Friendly zombie specialist. Avid bacon expert. Tv junkie. Alcohol guru. Food aficionado.,6.0,7.0,6.0,2
6163,Wannabe coffee practitioner. Troublemaker. Communicator. Friendly travel advocate. Reader. Thinker.,7.0,7.0,3.0,4
6164,Lifelong travel expert. Evil gamer. Infuriatingly humble introvert. Devoted web junkie. Typical internet practitioner. Passionate alcohol buff.,9.0,1.0,0.0,3


## Creating the New Profile Data

In [5]:
# Instantiating a new DF row to append later
new_profile = pd.DataFrame(columns=raw_df.columns)

# Adding random values for new data
for i in new_profile.columns[1:]:
    new_profile[i] = np.random.randint(0,10,1)

# Printing an user interface for inputting new values
print("Enter new profile information...\n\nExample Bio:\nBacon enthusiast. Falls down a lot. Freelance social media fan. Infuriatingly humble introvert.")

# Asking for new profile data
new_profile['Bios'] = input("Enter a Bio for yourself: ")

# Indexing that new profile data
new_profile.index = [raw_df.index[-1] + 1]

Enter new profile information...

Example Bio:
Bacon enthusiast. Falls down a lot. Freelance social media fan. Infuriatingly humble introvert.


### The New Data

In [6]:
new_profile

Unnamed: 0,Bios,Loation,Qualification,Specialisation
6165,i need some help with heavy vechicle,7,7,7


# Two Approaches
1. Cluster all the profiles again with the new profile

2. Classify the new profile with a classification model trained on our previously clustered data

## Clustering the New Profile Data

In [7]:
# Appending the new data
new_cluster = raw_df.append(new_profile)

### Scaling

In [8]:
# Instantiating the Scaler
scaler = MinMaxScaler()

# Scaling the categories then replacing the old values
df = new_cluster[['Bios']].join(pd.DataFrame(scaler.fit_transform(new_cluster.drop('Bios', axis=1)), columns=new_cluster.columns[1:], index=new_cluster.index))

### Vectorizing

In [9]:
# Instantiating the Vectorizer
vectorizer = CountVectorizer()

# Fitting the vectorizer to the Bios
x = vectorizer.fit_transform(df['Bios'])

# Creating a new DF that contains the vectorized words
df_wrds = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names())

# Concating the words DF with the original DF
new_df = pd.concat([df, df_wrds], axis=1)

# Dropping the Bios because it is no longer needed in place of vectorization
new_df.drop('Bios', axis=1, inplace=True)

### PCA

In [10]:
from sklearn.decomposition import PCA

# Instantiating PCA
pca = PCA()

# Fitting and Transforming the DF
df_pca = pca.fit_transform(new_df)

# Finding the exact number of features that explain at least 99% of the variance in the dataset
total_explained_variance = pca.explained_variance_ratio_.cumsum()
n_over_99 = len(total_explained_variance[total_explained_variance>=.99])
n_to_reach_99 = new_df.shape[1] - n_over_99

# Reducing the dataset to the number of features determined before
pca = PCA(n_components=n_to_reach_99)

# Fitting and transforming the dataset to the stated number of features
df_pca = pca.fit_transform(new_df)

# Seeing the variance ratio that still remains after the dataset has been reduced
pca.explained_variance_ratio_.cumsum()[-1]

0.9896490759603953

### Performing Hierarchical Agglomerative Clustering
- First finding the optimum number of clusters

In [12]:
# Setting the amount of clusters to test out
cluster_cnt = [i for i in range(2, 20, 1)]

# Establishing empty lists to store the scores for the evaluation metrics
ch_scores = []

s_scores = []

db_scores = []

# Looping through different iterations for the number of clusters
for i in tqdm(cluster_cnt):
    
    # Clustering with different number of clusters
    hac = AgglomerativeClustering(n_clusters=i)
    
    hac.fit(df_pca)
    
    cluster_assignments = hac.labels_
    
    # Appending the scores to the empty lists
    ch_scores.append(calinski_harabasz_score(df_pca, cluster_assignments))
    
    s_scores.append(silhouette_score(df_pca, cluster_assignments))
    
    db_scores.append(davies_bouldin_score(df_pca, cluster_assignments))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm(cluster_cnt):


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=18.0), HTML(value='')))




### Helper Function to Evaluate the Clusters

In [13]:
def cluster_eval(y, x):
    """
    Prints the scores of a set evaluation metric. Prints out the max and min values of the evaluation scores.
    """
    
    # Creating a DataFrame for returning the max and min scores for each cluster
    df = pd.DataFrame(columns=['Cluster Score'], index=[i for i in range(2, len(y)+2)])
    df['Cluster Score'] = y
    
    print('Max Value:\nCluster #', df[df['Cluster Score']==df['Cluster Score'].max()])
    print('\nMin Value:\nCluster #', df[df['Cluster Score']==df['Cluster Score'].min()])

### Evaluation of Clusters

In [14]:
print("The Calinski-Harabasz Score (find max score):")
cluster_eval(ch_scores, cluster_cnt)

print("\nThe Silhouette Coefficient Score (find max score):")
cluster_eval(s_scores, cluster_cnt)

print("\nThe Davies-Bouldin Score (find minimum score):")
cluster_eval(db_scores, cluster_cnt)

The Calinski-Harabasz Score (find max score):
Max Value:
Cluster #    Cluster Score
3     143.675265

Min Value:
Cluster #     Cluster Score
19      78.497954

The Silhouette Coefficient Score (find max score):
Max Value:
Cluster #    Cluster Score
2       0.069948

Min Value:
Cluster #    Cluster Score
3       0.027911

The Davies-Bouldin Score (find minimum score):
Max Value:
Cluster #    Cluster Score
3       4.520621

Min Value:
Cluster #    Cluster Score
2       3.263941


### Running HAC
Again but with the optimum cluster count

In [15]:
# Instantiating HAC
hac = AgglomerativeClustering(n_clusters=12)

# Fitting
hac.fit(df_pca)

# Getting cluster assignments
cluster_assignments = hac.labels_

# Unscaling the categories then replacing the scaled values
df = df[['Bios']].join(pd.DataFrame(scaler.inverse_transform(df.drop('Bios', axis=1)), columns=df.columns[1:], index=df.index))

# Assigning the clusters to each profile
df['Cluster #'] = cluster_assignments


### Finding the Exact Cluster for our New Profile

In [16]:
# Getting the Cluster # for the new profile
profile_cluster = df.loc[new_profile.index]['Cluster #'].values[0]

# Using the Cluster # to narrow down the DF
profile_df = df[df['Cluster #']==profile_cluster].drop('Cluster #', axis=1)

### Vectorizing the Selected Cluster

In [17]:
# Fitting the vectorizer to the Bios
cluster_x = vectorizer.fit_transform(profile_df['Bios'])

# Creating a new DF that contains the vectorized words
cluster_v = pd.DataFrame(cluster_x.toarray(), index=profile_df.index, columns=vectorizer.get_feature_names())

# Joining the Vectorized DF to the previous DF
profile_df = profile_df.join(cluster_v).drop('Bios', axis=1)

### Finding Correlation for Top 10 Similar Profiles to the New Profile

In [18]:
# Trasnposing the DF so that we are correlating with the index(users) and finding the correlation
corr = profile_df.T.corr()

# Finding the Top 10 similar or correlated users to the new user
user_n = new_profile.index[0]

# Creating a DF with the Top 10 most similar profiles
top_10_sim = corr[[user_n]].sort_values(by=[user_n],axis=0, ascending=False)[1:11]

### The Top 10 Profiles most likely to Match with the New Profile
(Sorted by descending similarity)

In [19]:
raw_df.loc[top_10_sim.index]

Unnamed: 0,Bios,Loation,Qualification,Specialisation
3149,Freelance bacon expert. Music enthusiast. Unapologetic internetaholic.,8,8,9
2231,Writer. Devoted travel advocate. Zombie fan. Internet evangelist.,7,9,7
3179,Freelance bacon expert. Music enthusiast. Unapologetic internetaholic.,6,6,7
2189,Introvert. Unapologetic bacon trailblazer. Devoted internet expert. Award-winning music advocate.,7,9,9
3695,Freelance analyst. Troublemaker. General tv aficionado. Lifelong coffee guru. Avid communicator.,7,8,9
1938,Explorer. Devoted tv enthusiast. Student. Award-winning alcohol nerd. Evil web fan. Internet lover.,8,8,9
2857,Troublemaker. Explorer. Freelance travel guru. Coffee buff. Internet maven. Alcohol expert.,8,7,7
6016,Evil creator. Travel maven. Award-winning internet expert. Gamer. Troublemaker. Incurable student. Entrepreneur. Webaholic.,9,7,9
2815,Thinker. Professional reader. Webaholic. Award-winning bacon advocate. Hardcore food ninja. Freelance travel fan.,8,8,7
5554,Award-winning music evangelist. Communicator. Subtly charming troublemaker. Food guru. Infuriatingly humble web ninja.,9,8,7


## Classification of the New Profile

### Importing the Different Classification Models

In [20]:
# Importing 3 models
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

### Vectorizing the Data

In [21]:
# Assigning the split variables
X = cluster_df.drop(["Cluster #"], 1)
y = cluster_df['Cluster #']

## Vectorizing
# Instantiating the Vectorizer
vectorizer = CountVectorizer()

# Fitting the vectorizer to the Bios
x = vectorizer.fit_transform(X['Bios'])

# Creating a new DF that contains the vectorized words
df_wrds = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names())

# Concating the words DF with the original DF
X = pd.concat([X, df_wrds], axis=1)

# Dropping the Bios because it is no longer needed in place of vectorization
X.drop(['Bios'], axis=1, inplace=True)

#### Scaling

In [22]:
# Scaling the Data
scaler = MinMaxScaler()

X = pd.DataFrame(scaler.fit_transform(X), index=X.index, columns=X.columns)

### Preparing the New Profile Data
For Vectorization purposes, the new profile will have to be able to fit into trained data (has to have the same columns).

Two Options:
1. __Vectorized the New Profile data with the vectorizer fitted to the dataset as to not include potentially new vocabulary. _(Keeps dimensionality the same)___
2. Vectorized the New Profile data with a new vectorizer fitted to it in order to include new vocabulary. _(Increases dimensionality with every new piece of data)_

#### Vectorizing

In [23]:
# Vectorizing the new data
vect_new_prof = vectorizer.transform(new_profile['Bios'])

# Quick DF of the vectorized words
new_vect_w = pd.DataFrame(vect_new_prof.toarray(), columns=vectorizer.get_feature_names(), index=new_profile.index)

# Concatenating the DFs for the new profile data
new_vect_prof = pd.concat([new_profile, new_vect_w], 1).drop('Bios', 1)

# Scaling the new profile data
new_vect_prof = pd.DataFrame(scaler.transform(new_vect_prof), columns=new_vect_prof.columns, index=new_vect_prof.index)

In [24]:
new_vect_prof

Unnamed: 0,Loation,Qualification,Specialisation,advocate,aficionado,alcohol,alcoholaholic,amateur,analyst,animals,...,unable,unapologetic,wannabe,web,webaholic,winning,with,writer,zombie,zombieaholic
6165,0.777778,0.777778,0.777778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


### Train, test, splitting

In [25]:
# Train, test, split
X_train, X_test, y_train, y_test = train_test_split(X, y)

### Finding the Best Model
- Dummy (Baseline Model)
- KNN
- SVM

In [26]:
# Dummy
dummy = DummyClassifier(strategy='stratified')

# KNN
knn = KNeighborsClassifier()

# SVM
svm = SVC()

# List of models
models = [dummy, knn, svm]

# List of model names
names = ['Dummy', 'KNN', 'SVM']

# Zipping the lists
classifiers = dict(zip(names, models))

Since we are dealing with an imbalanced dataset _(because each cluster is not guaranteed to have the same amount of profiles)_, we will resort to using the __Macro Avg__ and __F1 Score__ for evaluating the performances of each model.

In [27]:
# Dictionary containing the model names and their scores
models_f1 = {}

# Looping through each model's predictions and getting their classification reports
for name, model in classifiers.items():
    # Fitting the model
    model.fit(X_train, y_train)
    
    print('\n'+ name + ' (Macro Avg - F1 Score):')
    
    # Classification Report
    report = classification_report(y_test, model.predict(X_test), output_dict=True)
    f1 = report['macro avg']['f1-score']
    
    # Assigning to the Dictionary
    models_f1[name] = f1
    
    print(f1)


Dummy (Macro Avg - F1 Score):
0.08882137923883537

KNN (Macro Avg - F1 Score):
1.0

SVM (Macro Avg - F1 Score):
1.0


#### Model with the Best Performance

In [28]:
print(max(models_f1, key=models_f1.get), 'Score:', max(models_f1.values()))

KNN Score: 1.0


### Using the Best Model to Classify the New Profile
_(Optional: Tune the model with GridSearch)_

In [29]:
# Fitting the model
svm.fit(X, y)

# Predicting the New Profile data by determining which Cluster it would belong to
designated_cluster = svm.predict(new_vect_prof)

designated_cluster

array([4], dtype=int64)

### DF containing the Profiles of the Designated Cluster

In [30]:
des_cluster = cluster_df[cluster_df['Cluster #']==designated_cluster[0]]

des_cluster

Unnamed: 0,Bios,Loation,Qualification,Specialisation,Cluster #
7,Certified web evangelist. Proud bacon trailblazer. Travel aficionado. Alcohol scholar.,7.0,9.0,1.0,4
9,Hipster-friendly musicaholic. Wannabe tv fanatic. Certified gamer. Amateur coffee specialist.,9.0,4.0,3.0,4
10,Total entrepreneur. Proud web fanatic. Typical beer scholar. Student. Lifelong explorer. Tv maven.,3.0,4.0,3.0,4
11,Music fan. Beer geek. Web lover. Falls down a lot. Coffee nerd. Travel junkie.,1.0,4.0,3.0,4
12,Falls down a lot. Typical beer guru. Creator. Subtly charming alcohol enthusiast. Incurable tv buff. Reader.,5.0,7.0,0.0,4
...,...,...,...,...,...
6133,Wannabe coffee practitioner. Troublemaker. Communicator. Friendly travel advocate. Reader. Thinker.,0.0,7.0,6.0,4
6143,Writer. Total coffee scholar. Travel lover. Thinker. Troublemaker. Hardcore bacon practitioner.,1.0,8.0,7.0,4
6148,Wannabe coffee practitioner. Troublemaker. Communicator. Friendly travel advocate. Reader. Thinker.,1.0,8.0,9.0,4
6158,Writer. Total coffee scholar. Travel lover. Thinker. Troublemaker. Hardcore bacon practitioner.,6.0,1.0,8.0,4


### Finding the Top 10 Similar Profiles to our New Profile

In [31]:
# Appending the new profile data
des_cluster = des_cluster.append(new_profile, sort=False)

# Fitting the vectorizer to the Bios
cluster_x = vectorizer.fit_transform(des_cluster['Bios'])

# Creating a new DF that contains the vectorized words
cluster_v = pd.DataFrame(cluster_x.toarray(), index=des_cluster.index, columns=vectorizer.get_feature_names())

# Joining the Vectorized DF to the previous DF and dropping columns
des_cluster = des_cluster.join(cluster_v).drop(['Bios', 'Cluster #'], axis=1)

des_cluster

Unnamed: 0,Loation,Qualification,Specialisation,advocate,aficionado,alcohol,alcoholaholic,amateur,analyst,avid,...,tvaholic,twitter,typical,unapologetic,vechicle,wannabe,web,with,writer,zombie
7,7.0,9.0,1.0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
9,9.0,4.0,3.0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
10,3.0,4.0,3.0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
11,1.0,4.0,3.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
12,5.0,7.0,0.0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6143,1.0,8.0,7.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
6148,1.0,8.0,9.0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
6158,6.0,1.0,8.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
6163,7.0,7.0,3.0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


#### Correlations to find similar profiles

In [32]:
# Finding the Top 10 similar or correlated users to the new user
user_n = new_profile.index[0]

# Trasnposing the DF so that we are correlating with the index(users) and finding the correlation
corr = des_cluster.T.corrwith(des_cluster.loc[user_n])

# Creating a DF with the Top 10 most similar profiles
top_10_sim = corr.sort_values(ascending=False)[1:11]

### Top 10 Similar profiles

In [33]:
raw_df.loc[top_10_sim.index]

Unnamed: 0,Bios,Loation,Qualification,Specialisation
3454,Coffee geek. Alcoholaholic. Extreme internet fan. Twitter lover.,9,8,8
5754,Total entrepreneur. Evil zombie enthusiast. Troublemaker. Avid food advocate.,9,9,8
3875,General tv fanatic. Incurable bacon aficionado. Unapologetic beer trailblazer. Food junkie.,9,9,9
4881,Avid explorer. Lifelong beer specialist. Incurable tv geek. Thinker. Communicator.,8,9,9
2139,Coffee evangelist. Certified tv scholar. Web fanatic. Beer lover. Analyst.,8,9,8
5583,Passionate introvert. Internet specialist. Proud alcohol ninja. Typical food nerd. Entrepreneur.,9,9,8
3935,General tv fanatic. Incurable bacon aficionado. Unapologetic beer trailblazer. Food junkie.,8,9,9
5193,Certified music trailblazer. Writer. Explorer. General entrepreneur. Zombie specialist.,8,8,7
5133,Certified music trailblazer. Writer. Explorer. General entrepreneur. Zombie specialist.,8,7,7
5223,Certified music trailblazer. Writer. Explorer. General entrepreneur. Zombie specialist.,8,7,7


### Saving the Classification Model
For future use

In [34]:
from joblib import dump

dump(svm, "clf_model.joblib")

['clf_model.joblib']

## Conclusion on the Two Different Approaches
The results for both approaches are the same.  The new profile ends up in the same cluster whether it is clustered or classified to be there.