<a href="https://colab.research.google.com/github/IpshitaSingh/Vonder/blob/main/NewProfile.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Creating New Profiles

In [2]:
# Loading the Libraries and Data
import pandas as pd
pd.set_option('display.max_colwidth', 500)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import _pickle as pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from tqdm import tqdm_notebook as tqdm

# Loading in the original unclustered DF
with open("numuser_data.pkl",'rb') as fp:
    raw_df = pickle.load(fp)

# Loading in the clustered DF
with open("clustered_profiles.pkl",'rb') as fp:
    cluster_df = pickle.load(fp)

In [7]:
# Instantiating a new DF row to append later
new_profile = pd.DataFrame(columns=raw_df.columns)

# Adding random values for new data
for i in new_profile.columns[1:]:
    new_profile[i] = np.random.randint(0,10,1)

# Printing an user interface for inputting new values
print("Enter new profile information...\n\nExample Bio:\nBacon enthusiast. Falls down a lot. Freelance social media fan. Infuriatingly humble introvert.")

# Asking for new profile data
new_profile['Bios'] = input("Enter bio: ")

# Indexing that new profile data
new_profile.index = [raw_df.index[-1] + 1]

Enter new profile information...

Example Bio:
Bacon enthusiast. Falls down a lot. Freelance social media fan. Infuriatingly humble introvert.
Enter bio: dog lover. nature lover. food lover


In [9]:
# object = pd.read_pickle(r'numuser_data.pkl')
# object

In [10]:
# Appending the new data
new_cluster = raw_df.append(new_profile)

## Scaling

In [11]:
# Instantiating the Scaler
scaler = MinMaxScaler()

# Scaling the categories then replacing the old values
df = new_cluster[['Bios']].join(pd.DataFrame(scaler.fit_transform(new_cluster.drop('Bios', axis=1)), columns=new_cluster.columns[1:], index=new_cluster.index))

## Vectorizing

In [12]:
# Instantiating the Vectorizer
vectorizer = CountVectorizer()

# Fitting the vectorizer to the Bios
x = vectorizer.fit_transform(df['Bios'])

# Creating a new DF that contains the vectorized words
df_wrds = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names())

# Concating the words DF with the original DF
new_df = pd.concat([df, df_wrds], axis=1)

# Dropping the Bios because it is no longer needed in place of vectorization
new_df.drop('Bios', axis=1, inplace=True)

## PCA

In [13]:
# Importing the library
from sklearn.decomposition import PCA

# Instantiating PCA
pca = PCA()

# Fitting and Transforming the DF
df_pca = pca.fit_transform(new_df)

# Finding the exact number of features that explain at least 99% of the variance in the dataset
total_explained_variance = pca.explained_variance_ratio_.cumsum()
n_over_99 = len(total_explained_variance[total_explained_variance>=.99])
n_to_reach_99 = new_df.shape[1] - n_over_99

# Reducing the dataset to the number of features determined before
pca = PCA(n_components=n_to_reach_99)

# Fitting and transforming the dataset to the stated number of features
df_pca = pca.fit_transform(new_df)

## HAC

In [14]:
# Setting the amount of clusters to test out
cluster_cnt = [i for i in range(2, 20, 1)]

# Establishing empty lists to store the scores for the evaluation metrics
s_scores = []

db_scores = []

# Looping through different iterations for the number of clusters
for i in tqdm(cluster_cnt):
    
    # Clustering with different number of clusters
    hac = AgglomerativeClustering(n_clusters=i)
    
    hac.fit(df_pca)
    
    cluster_assignments = hac.labels_
    
    # Appending the scores to the empty lists    
    s_scores.append(silhouette_score(df_pca, cluster_assignments))
    
    db_scores.append(davies_bouldin_score(df_pca, cluster_assignments))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


  0%|          | 0/18 [00:00<?, ?it/s]

In [15]:
def cluster_eval(y, x):
    """
    Prints the scores of a set evaluation metric. Prints out the max and min values of the evaluation scores.
    """
    
    # Creating a DataFrame for returning the max and min scores for each cluster
    df = pd.DataFrame(columns=['Cluster Score'], index=[i for i in range(2, len(y)+2)])
    df['Cluster Score'] = y
    
    print('Max Value:\nCluster #', df[df['Cluster Score']==df['Cluster Score'].max()])
    print('\nMin Value:\nCluster #', df[df['Cluster Score']==df['Cluster Score'].min()])
    
    
print("\nThe Silhouette Coefficient Score (find max score):")
cluster_eval(s_scores, cluster_cnt)

print("\nThe Davies-Bouldin Score (find minimum score):")
cluster_eval(db_scores, cluster_cnt)


The Silhouette Coefficient Score (find max score):
Max Value:
Cluster #     Cluster Score
19       0.056955

Min Value:
Cluster #    Cluster Score
2        0.01493

The Davies-Bouldin Score (find minimum score):
Max Value:
Cluster #    Cluster Score
3       5.947207

Min Value:
Cluster #     Cluster Score
19       3.430041


## Running HAC


In [16]:
# Instantiating HAC
hac = AgglomerativeClustering(n_clusters=12)

# Fitting
hac.fit(df_pca)

# Getting cluster assignments
cluster_assignments = hac.labels_

# Unscaling the categories then replacing the scaled values
df = df[['Bios']].join(pd.DataFrame(scaler.inverse_transform(df.drop('Bios', axis=1)), columns=df.columns[1:], index=df.index))

# Assigning the clusters to each profile
df['Cluster #'] = cluster_assignments


## Finding the Exact Cluster for our New Profile
# Getting the Cluster # for the new profile
profile_cluster = df.loc[new_profile.index]['Cluster #'].values[0]

# Using the Cluster # to narrow down the DF
profile_df = df[df['Cluster #']==profile_cluster].drop('Cluster #', axis=1)


## Vectorizing
Finding Top 10 Profiles for Newly Created Profile

In [17]:
# Fitting the vectorizer to the Bios
cluster_x = vectorizer.fit_transform(profile_df['Bios'])

# Creating a new DF that contains the vectorized words
cluster_v = pd.DataFrame(cluster_x.toarray(), index=profile_df.index, columns=vectorizer.get_feature_names())

# Joining the Vectorized DF to the previous DF
profile_df = profile_df.join(cluster_v).drop('Bios', axis=1)


## Correlation
# Trasnposing the DF so that we are correlating with the index(users) and finding the correlation
corr = profile_df.T.corr()

# Finding the Top 10 similar or correlated users to the new user
user_n = new_profile.index[0]

# Creating a DF with the Top 10 most similar profiles
top_10_sim = corr[[user_n]].sort_values(by=[user_n],axis=0, ascending=False)[1:11]

# Displaying the Top 10
raw_df.loc[top_10_sim.index]

Unnamed: 0,Bios,Location,Interested in,Age,Movies,Music,Sports,Politics,Social Media
3729,Food specialist. Hardcore writer. Tv practitioner. Amateur explorer. Passionate gamer.,9,5,1,3,0,5,7,4
2063,Amateur tv junkie. Communicator. Music expert. Travel guru. Wannabe introvert. Internet practitioner.,8,6,1,0,4,3,8,6
2391,Typical travel expert. Unapologetic reader. Friendly food buff. Certified zombie nerd. Thinker.,7,3,2,1,3,6,8,5
4634,Explorer. Amateur food evangelist. Passionate tv fan. Typical social mediaholic. Beer fanatic.,8,3,2,0,0,1,6,4
253,Total bacon advocate. Devoted zombie scholar. Falls down a lot. Infuriatingly humble gamer. Social media expert.,9,2,0,1,0,6,9,3
2321,Coffee lover. Introvert. Food fanatic. Analyst. Bacon advocate. Extreme twitter fanatic.,7,4,3,5,0,2,7,5
2450,Twitter aficionado. Hardcore alcohol practitioner. Bacon expert. Typical web nerd. Creator.,9,7,0,0,6,3,9,6
396,Communicator. Extreme writer. Tv nerd. Wannabe organizer. Amateur music maven. Analyst.,7,2,4,0,1,2,9,5
4357,Lifelong coffee maven. Food fanatic. Subtly charming social mediaholic. Tv trailblazer.,8,4,2,0,0,5,8,1
3957,Zombie trailblazer. Beer evangelist. Internet enthusiast. General thinker. Passionate introvert.,8,9,5,3,3,4,8,8
