In [29]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.preprocessing import MinMaxScaler

In [30]:
influencer_df=pd.read_csv("influencers.csv")

In [31]:
usersdf=pd.read_csv("SocialMediaUsers.csv")

In [32]:
usersdf.drop(columns=["Gender","UserID","City","DOB",],inplace=True)

In [33]:
usersdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   Name       100000 non-null  object
 1   Interests  100000 non-null  object
 2   Country    100000 non-null  object
dtypes: object(3)
memory usage: 2.3+ MB


In [34]:
usersdf["keywords"]=usersdf["Interests"]+","+usersdf["Country"]

In [35]:
usersdf["keywords"]=usersdf["keywords"].str.replace("'","")

In [36]:
usersdf.drop(columns=["Interests","Country"],inplace=True)

In [37]:
usersdf

Unnamed: 0,Name,keywords
0,Jesse Lawhorn,"Movies, Fashion, Fashion, Books,Indonesia"
1,Stacy Payne,"Gaming, Finance and investments, Outdoor activ..."
2,Katrina Nicewander,"DIY and crafts, Music, Science, Fashion,Jordan"
3,Eric Yarbrough,"Outdoor activities, Cars and automobiles,Italy"
4,Daniel Adkins,"Politics, History,Venezuela"
...,...,...
99995,Lionel Denault,"DIY and crafts,China"
99996,Margie Mieszala,"Cars and automobiles, Cooking, Outdoor activit..."
99997,Joan Mercedes,"Business and entrepreneurship, Cooking,Chile"
99998,Marvin Massa,"Gaming, Business and entrepreneurship, Fashion..."


In [38]:
influencer_df["keywords"]=influencer_df["country"]+","+influencer_df["Category_1"]+","+influencer_df["Category_2"]+","+influencer_df["Category_3"]

In [39]:
influencer_df["keywords"]=influencer_df["keywords"].str.replace("'","")

In [40]:
influencer_df.drop(columns=["country","Category_1","Category_2","Category_3"],inplace=True)

In [41]:
influencer_df

Unnamed: 0,S.no,username,followers,viewers,Average views,keywords
0,1,cristiano,477.9M,5M,6.2M,"India,Parenting and family,Art,History"
1,2,kyliejenner,368.1M,3.5M,5.5M,"United States,Finance and investments,Travel,Pets"
2,3,arianagrande,329.6M,2.9M,4M,"United States,Science,Movies,Beauty"
3,4,leomessi,358.6M,2.7M,3.5M,"Indonesia,Cars and automobiles,Beauty,Health a..."
4,5,zendaya,151.1M,4.3M,5.8M,"United States,Parenting and family,Beauty,Art"
...,...,...,...,...,...,...
1045,996,jadethirlwall,9.4M,174.2K,228.1K,"United States,Science,Cooking,Gardening"
1046,997,ninja,12.9M,127.8K,163.9K,"United States,Gaming,Travel,DIY and crafts"
1047,998,myriamfares,20.6M,75.8K,102.4K,"Iraq,Photography,Science,Technology"
1048,999,optimushwang,4.2M,426K,508K,"South Korea,Technology,Science,Social causes a..."


In [42]:
def convert_to_thousands(value):
    if value.endswith('K'):
        return float(value[:-1]) 
    elif value.endswith('M'):
        return float(value[:-1]) * 1000
    else:
        return float(value)

In [43]:
influencer_df['followers'] = influencer_df['followers'].apply(convert_to_thousands)

In [44]:
influencer_df['viewers'] = influencer_df['viewers'].apply(convert_to_thousands)

In [45]:
influencer_df['Average views'] = influencer_df['Average views'].apply(convert_to_thousands)

In [46]:
influencer_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1050 entries, 0 to 1049
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   S.no           1050 non-null   int64  
 1   username       1050 non-null   object 
 2   followers      1050 non-null   float64
 3   viewers        1050 non-null   float64
 4   Average views  1050 non-null   float64
 5   keywords       1046 non-null   object 
dtypes: float64(3), int64(1), object(2)
memory usage: 49.3+ KB


In [47]:
usersdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   Name      100000 non-null  object
 1   keywords  100000 non-null  object
dtypes: object(2)
memory usage: 1.5+ MB


In [48]:
influencer_df.rename(columns={"followers":"Followers(in_K)","viewers":"Viewers(in_K)","Average views":"Average_views(in_K)"},inplace=True)

In [49]:
influencer_df.drop(index=influencer_df[influencer_df.username.duplicated(keep="first")==True].index,inplace=True)

In [50]:
influencer_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 983 entries, 0 to 999
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   S.no                 983 non-null    int64  
 1   username             983 non-null    object 
 2   Followers(in_K)      983 non-null    float64
 3   Viewers(in_K)        983 non-null    float64
 4   Average_views(in_K)  983 non-null    float64
 5   keywords             979 non-null    object 
dtypes: float64(3), int64(1), object(2)
memory usage: 53.8+ KB


In [51]:
cols=["Followers(in_K)","Viewers(in_K)","Average_views(in_K)"]
data=influencer_df[cols]
scaler = MinMaxScaler(feature_range=(0,100))
normalized_data = scaler.fit_transform(data)

In [52]:
tfidf=TfidfVectorizer(stop_words="english")
influencer_df["keywords"]=influencer_df["keywords"].fillna("")
influencer_tfidf=tfidf.fit_transform(influencer_df.keywords)
users_tfidf=tfidf.fit_transform(usersdf.keywords)

In [55]:
cosine_sim=linear_kernel(influencer_tfidf,influencer_tfidf)

In [56]:
indices=pd.Series(influencer_df.index, index=influencer_df["username"])

In [58]:
def get_recommendations(username, cosine_sim=cosine_sim, normalized_data=normalized_data):
    idx = indices[username]
    user_normalized_data = normalized_data[idx]
    # Broadcast the user's normalized data to match the shape of cosine_sim
    user_normalized_data_broadcasted = np.tile(user_normalized_data, (len(cosine_sim), 1))
    
    # Combine cosine_sim and normalized_data
    combined_values = cosine_sim[idx] + user_normalized_data_broadcasted.sum(axis=1)
    
    # Enumerate and sort the combined values
    sim_scores = sorted(enumerate(combined_values), key=lambda x: x[1], reverse=True)
    
    # Get top 10 similar indices
    sim_scores = sim_scores[1:11]
    sim_indices = [i[0] for i in sim_scores]
    
    # Print recommended usernames
    print(influencer_df["username"].iloc[sim_indices],"\n",sim_scores)

get_recommendations('jokowi')


244    prattprattpratt
812          kaliuchis
546        sav.labrant
79          niallhoran
511         jackharlow
127             emrata
510         kit.connor
333         aron.piper
436        amandacerny
956       camilacoelho
Name: username, dtype: object 
 [(244, 7.5423965472120456), (798, 7.539722736726353), (536, 7.526946275212261), (79, 7.482455110395112), (501, 7.46789502345965), (127, 7.442938404526654), (500, 7.383418638514033), (333, 7.3684733687201), (431, 7.356750888983345), (942, 7.347741837328357)]
