# Importing required libraries

In [60]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.preprocessing import MinMaxScaler

# Data Extraction and pre-processing for social media users

In [61]:
usersdf=pd.read_csv("SocialMediaUsers.csv")

In [62]:
usersdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   UserID     100000 non-null  int64 
 1   Name       100000 non-null  object
 2   Gender     100000 non-null  object
 3   DOB        100000 non-null  object
 4   Interests  100000 non-null  object
 5   City       100000 non-null  object
 6   Country    100000 non-null  object
dtypes: int64(1), object(6)
memory usage: 5.3+ MB


dropping irrelevant columns 

In [63]:
usersdf.drop(columns=["Gender","UserID","City","DOB",],inplace=True)

combining interests and country for text analyis

In [64]:
usersdf["keywords"]=usersdf["Interests"]+","+usersdf["Country"]

In [65]:
usersdf["keywords"]=usersdf["keywords"].str.replace("'","")

In [66]:
usersdf.drop(columns=["Interests","Country"],inplace=True)

In [67]:
usersdf.head()

Unnamed: 0,Name,keywords
0,Jesse Lawhorn,"Movies, Fashion, Fashion, Books,Indonesia"
1,Stacy Payne,"Gaming, Finance and investments, Outdoor activ..."
2,Katrina Nicewander,"DIY and crafts, Music, Science, Fashion,Jordan"
3,Eric Yarbrough,"Outdoor activities, Cars and automobiles,Italy"
4,Daniel Adkins,"Politics, History,Venezuela"


# data extraction and pre-processing for social media influencers

In [68]:
influencer_df=pd.read_csv("influencers.csv")

In [69]:
influencer_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1050 entries, 0 to 1049
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   S.no           1050 non-null   int64 
 1   username       1050 non-null   object
 2   followers      1050 non-null   object
 3   country        1046 non-null   object
 4   viewers        1050 non-null   object
 5   Average views  1050 non-null   object
 6   Category_1     1050 non-null   object
 7   Category_2     1050 non-null   object
 8   Category_3     1050 non-null   object
dtypes: int64(1), object(8)
memory usage: 74.0+ KB


In [71]:
influencer_df.head()

Unnamed: 0,S.no,username,followers,country,viewers,Average views,Category_1,Category_2,Category_3
0,1,cristiano,477.9M,India,5M,6.2M,'Parenting and family','Art','History'
1,2,kyliejenner,368.1M,United States,3.5M,5.5M,'Finance and investments','Travel','Pets'
2,3,arianagrande,329.6M,United States,2.9M,4M,'Science','Movies','Beauty'
3,4,leomessi,358.6M,Indonesia,2.7M,3.5M,'Cars and automobiles','Beauty','Health and wellness'
4,5,zendaya,151.1M,United States,4.3M,5.8M,'Parenting and family','Beauty','Art'


**Data Transformation:** 

1: converting values of viewers, Average views and followers to a quantifiable figure

2: combining 'country', 'Category_1', 'Category_2', 'Category_3' for text analysis. 

In [72]:
influencer_df["keywords"]=influencer_df["country"]+","+influencer_df["Category_1"]+","+influencer_df["Category_2"]+","+influencer_df["Category_3"]
influencer_df["keywords"]=influencer_df["keywords"].str.replace("'","")
influencer_df.drop(columns=["country","Category_1","Category_2","Category_3"],inplace=True)

In [73]:
def convert_to_thousands(value):
    if value.endswith('K'):
        return float(value[:-1])
    elif value.endswith('M'):
        return float(value[:-1]) * 1000
    else:
        return float(value)

In [74]:
influencer_df['followers'] = influencer_df['followers'].apply(convert_to_thousands)

In [75]:
influencer_df['viewers'] = influencer_df['viewers'].apply(convert_to_thousands)

In [76]:
influencer_df['Average views'] = influencer_df['Average views'].apply(convert_to_thousands)

In [77]:
influencer_df.rename(columns={"followers":"Followers(in_K)","viewers":"Viewers(in_K)","Average views":"Average_views(in_K)"},inplace=True)

In [78]:
influencer_df.drop(index=influencer_df[influencer_df.username.duplicated(keep="first")==True].index,inplace=True)

In [79]:
influencer_df.head()

Unnamed: 0,S.no,username,Followers(in_K),Viewers(in_K),Average_views(in_K),keywords
0,1,cristiano,477900.0,5000.0,6200.0,"India,Parenting and family,Art,History"
1,2,kyliejenner,368100.0,3500.0,5500.0,"United States,Finance and investments,Travel,Pets"
2,3,arianagrande,329600.0,2900.0,4000.0,"United States,Science,Movies,Beauty"
3,4,leomessi,358600.0,2700.0,3500.0,"Indonesia,Cars and automobiles,Beauty,Health a..."
4,5,zendaya,151100.0,4300.0,5800.0,"United States,Parenting and family,Beauty,Art"


In [85]:
influencer_df.describe()

Unnamed: 0,S.no,Followers(in_K),Viewers(in_K),Average_views(in_K)
count,983.0,983.0,983.0,983.0
mean,499.533062,27059.104781,574.240895,757.683215
std,289.748966,45371.6719,968.373386,1081.645868
min,1.0,1900.0,0.0,34.2
25%,246.5,8950.0,175.45,255.7
50%,502.0,14900.0,323.4,457.4
75%,751.5,27300.0,600.3,819.25
max,1000.0,546600.0,12700.0,12700.0


**data normalization:** 
count of followers, viewers and average views will contribute to the chances of a movie being recommended.  

In [80]:
cols=["Followers(in_K)","Viewers(in_K)","Average_views(in_K)"]
data=influencer_df[cols]
scaler = MinMaxScaler(feature_range=(0,100))
normalized_data = scaler.fit_transform(data)

Tf*Idf algorithm is being used to analyze the keywords for comparison

In [81]:
tfidf=TfidfVectorizer(stop_words="english")
influencer_df["keywords"]=influencer_df["keywords"].fillna("")
influencer_tfidf=tfidf.fit_transform(influencer_df.keywords)
users_tfidf=tfidf.transform(usersdf["keywords"])

cosine similarity score determines how similar are interests of users are with the type of content created by influencers.

In [82]:
cosine_sim=linear_kernel(influencer_tfidf,users_tfidf)

Here is the implementation of the function to give recommendations to the target. '**username**' is the name of target user.

In [83]:
def get_recommendations(username, influencer_df, usersdf, cosine_sim, normalized_data):
    user_index = usersdf[usersdf['Name'] == username].index[0] #index of target user
    user_cosine_sim = cosine_sim[:, user_index] #compares similarity of interests of 
    #target to content provided by creator
    combined_values = user_cosine_sim + normalized_data.sum(axis=1)#combines the above comparison 
    #score with followers, viewers and average views
    sim_scores = sorted(enumerate(combined_values), key=lambda x: x[1], reverse=True)[:10]
    sim_indices = [i[0] for i in sim_scores]
    print(influencer_df["username"].iloc[sim_indices])

get_recommendations('Jesse Lawhorn', influencer_df, usersdf, cosine_sim, normalized_data)

7                thv
12               j.m
0          cristiano
17            agustd
19               jin
1        kyliejenner
24         uarmyhope
11    tomholland2013
2       arianagrande
3           leomessi
Name: username, dtype: object
