## importing necessary Libraries and dataset

In [18]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer, LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

In [19]:
influencer_df=pd.read_csv("influencers.csv") # Content Data

In [21]:
influencer_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   S.no           1000 non-null   int64 
 1   username       1000 non-null   object
 2   followers      1000 non-null   object
 3   Country        996 non-null    object
 4   viewers        1000 non-null   object
 5   Average views  1000 non-null   object
 6   Categories     1000 non-null   object
dtypes: int64(1), object(6)
memory usage: 54.8+ KB


In [22]:
influencer_df.head()

Unnamed: 0,S.no,username,followers,Country,viewers,Average views,Categories
0,1,cristiano,477.9M,India,5M,6.2M,"'Parenting and family', 'Art', 'History'"
1,2,kyliejenner,368.1M,United States,3.5M,5.5M,"'Finance and investments', 'Travel', 'Pets'"
2,3,arianagrande,329.6M,United States,2.9M,4M,"'Science', 'Movies', 'Beauty'"
3,4,leomessi,358.6M,Indonesia,2.7M,3.5M,"'Cars and automobiles', 'Beauty', 'Health and ..."
4,5,zendaya,151.1M,United States,4.3M,5.8M,"'Parenting and family', 'Beauty', 'Art'"


## Data Preprocessing for influencer data

**Dealing with duplicate data**

In [23]:
print(f'{influencer_df.duplicated().sum()} duplicates detected')

0 duplicates detected


**Encoding text labels**

In [24]:
influencer_df["Categories"]=influencer_df["Categories"].apply(lambda x: x.split(", "))

In [25]:
mlb = MultiLabelBinarizer()
OHE_df = pd.DataFrame(mlb.fit_transform(influencer_df['Categories']), 
                      columns=[col.replace("'", "").replace('"', '') for col in mlb.classes_]
                     )

In [28]:
influencer_df.drop(columns=['Categories','S.no'], inplace=True)
influencer_df = pd.concat([influencer_df, OHE_df], axis=1)

**Encoding engagement data**

In [29]:
def unit_stdzer(value):
    if value.endswith('K'):
        return float(value[:-1])
    elif value.endswith('M'):
        return float(value[:-1]) * 1000
    else:
        return float(value)

In [30]:
num_cols=['followers', 'Average views','viewers']
for col in num_cols:
    influencer_df[col] = influencer_df[col].apply(unit_stdzer)

In [31]:
scaler = StandardScaler()
engagement_data = pd.DataFrame(scaler.fit_transform(influencer_df[num_cols]),
                               columns=num_cols)

In [32]:
engagement_data

Unnamed: 0,followers,Average views,viewers
0,10.023948,5.077109,4.612757
1,7.583637,4.424813,3.051233
2,6.727972,3.027035,2.426624
3,7.372499,2.561110,2.218421
4,2.760799,4.704368,3.884046
...,...,...,...
995,-0.388492,-0.487816,-0.410977
996,-0.310704,-0.547641,-0.459280
997,-0.139571,-0.604950,-0.513413
998,-0.504062,-0.226991,-0.148850


In [33]:
influencer_df.drop(columns=num_cols, inplace=True)

**Encoding demographic data**

In [34]:
# Treating null values
influencer_df['Country']=influencer_df['Country'].replace({None:influencer_df['Country'].mode()[0]})

In [35]:
# label encoding for multiclass data
LE=LabelEncoder()
influencer_df['Country']=LE.fit_transform(influencer_df['Country'])

In [36]:
#storing the mappings for later use with data preprocessing of user data
country_mapping = dict(zip(LE.classes_, LE.transform(LE.classes_)))

In [37]:
influencer_df.head(10)

Unnamed: 0,username,Country,Art,Beauty,Books,Business and entrepreneurship,Cars and automobiles,Cooking,DIY and crafts,Education and learning,...,Outdoor activities,Parenting and family,Pets,Photography,Politics,Science,Social causes and activism,Sports,Technology,Travel
0,cristiano,10,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,kyliejenner,32,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
2,arianagrande,32,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,leomessi,11,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,zendaya,32,1,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
5,selenagomez,32,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,taylorswift,3,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,thv,25,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
8,virat.kohli,10,0,0,0,0,0,0,1,0,...,0,0,1,1,0,0,0,0,0,0
9,beyonce,32,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,1,0


In [38]:
influencer_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 31 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   username                       1000 non-null   object
 1   Country                        1000 non-null   int64 
 2   Art                            1000 non-null   int64 
 3   Beauty                         1000 non-null   int64 
 4   Books                          1000 non-null   int64 
 5   Business and entrepreneurship  1000 non-null   int64 
 6   Cars and automobiles           1000 non-null   int64 
 7   Cooking                        1000 non-null   int64 
 8   DIY and crafts                 1000 non-null   int64 
 9   Education and learning         1000 non-null   int64 
 10  Fashion                        1000 non-null   int64 
 11  Finance and investments        1000 non-null   int64 
 12  Fitness                        1000 non-null   int64 
 13  Food

# Data Preprocessing users data

In [58]:
usersdf=pd.read_csv("SocialMediaUsers.csv") # User Data 

In [59]:
usersdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   UserID     100000 non-null  int64 
 1   Name       100000 non-null  object
 2   Gender     100000 non-null  object
 3   DOB        100000 non-null  object
 4   Interests  100000 non-null  object
 5   City       100000 non-null  object
 6   Country    100000 non-null  object
dtypes: int64(1), object(6)
memory usage: 5.3+ MB


In [60]:
usersdf['Interests']=usersdf['Interests'].apply(lambda x: x.split(", "))

In [61]:
usersdf.head()

Unnamed: 0,UserID,Name,Gender,DOB,Interests,City,Country
0,1,Jesse Lawhorn,Female,1958-10-15,"['Movies', 'Fashion', 'Fashion', 'Books']",Sibolga,Indonesia
1,2,Stacy Payne,Female,2004-07-21,"['Gaming', 'Finance and investments', 'Outdoor...",Al Abyār,Libya
2,3,Katrina Nicewander,Female,2000-02-07,"['DIY and crafts', 'Music', 'Science', 'Fashion']",Wādī as Sīr,Jordan
3,4,Eric Yarbrough,Male,1985-04-14,"['Outdoor activities', 'Cars and automobiles']",Matera,Italy
4,5,Daniel Adkins,Female,1955-09-18,"['Politics', 'History']",Biruaca,Venezuela


**Encoding User preference data**

In [62]:
mlb_df=pd.DataFrame(mlb.fit_transform(usersdf["Interests"]),
                    columns=[col.replace("'", "").replace('"', '') for col in mlb.classes_])
usersdf.drop(columns=['Interests'], inplace=True)
usersdf = pd.concat([usersdf, mlb_df], axis=1)

**Encoding Demographic Data of user**

In [63]:
demographic_data=pd.DataFrame()

In [64]:
usersdf['DOB'] = pd.to_datetime(usersdf['DOB'])
demographic_data['Age'] = usersdf['DOB'].apply(lambda x: datetime.now().year - x.year)

In [65]:
demographic_data['Gender']=LE.fit_transform(usersdf['Gender'])

In [71]:
demographic_data.head()

Unnamed: 0,Age,Gender
0,66,0
1,20,0
2,24,0
3,39,1
4,69,0


In [67]:
usersdf['Country'] = usersdf['Country'].apply(lambda x: country_mapping.get(x, len(country_mapping) + list(country_mapping.values()).count(1)))
new_countries = usersdf[~usersdf['Country'].isin(country_mapping.keys())]['Country'].unique()
new_country_mapping = {country: i for i, country in enumerate(new_countries, start=len(country_mapping))}
country_mapping.update(new_country_mapping)
usersdf['Country'] = usersdf['Country'].map(country_mapping)

In [68]:
usersdf.drop(columns=['UserID','City','DOB','Gender'],inplace=True)

In [69]:
usersdf.head(10)

Unnamed: 0,Name,Country,Art,Beauty,Books,Business and entrepreneurship,Cars and automobiles,Cooking,DIY and crafts,Education and learning,...,Outdoor activities,Parenting and family,Pets,Photography,Politics,Science,Social causes and activism,Sports,Technology,Travel
0,Jesse Lawhorn,34,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Stacy Payne,69,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2,Katrina Nicewander,69,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,Eric Yarbrough,36,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,Daniel Adkins,69,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
5,Diane Jara,37,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6,Sheryl Morgan,38,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,William Harper,69,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
8,Virginia Varron,69,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
9,Charles Figueroa,39,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [70]:
usersdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 31 columns):
 #   Column                         Non-Null Count   Dtype 
---  ------                         --------------   ----- 
 0   Name                           100000 non-null  object
 1   Country                        100000 non-null  int64 
 2   Art                            100000 non-null  int64 
 3   Beauty                         100000 non-null  int64 
 4   Books                          100000 non-null  int64 
 5   Business and entrepreneurship  100000 non-null  int64 
 6   Cars and automobiles           100000 non-null  int64 
 7   Cooking                        100000 non-null  int64 
 8   DIY and crafts                 100000 non-null  int64 
 9   Education and learning         100000 non-null  int64 
 10  Fashion                        100000 non-null  int64 
 11  Finance and investments        100000 non-null  int64 
 12  Fitness                        100000 non-nul