## importing necessary Libraries and dataset

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
import requests

In [34]:
influencer_df=pd.read_csv("influencers.csv") # Content Data

In [35]:
influencer_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   S.no           1000 non-null   int64 
 1   username       1000 non-null   object
 2   followers      1000 non-null   object
 3   Country        996 non-null    object
 4   viewers        1000 non-null   object
 5   Average views  1000 non-null   object
 6   Categories     1000 non-null   object
dtypes: int64(1), object(6)
memory usage: 54.8+ KB


In [36]:
influencer_df.head()

Unnamed: 0,S.no,username,followers,Country,viewers,Average views,Categories
0,1,cristiano,477.9M,India,5M,6.2M,"'Parenting and family', 'Art', 'History'"
1,2,kyliejenner,368.1M,United States,3.5M,5.5M,"'Finance and investments', 'Travel', 'Pets'"
2,3,arianagrande,329.6M,United States,2.9M,4M,"'Science', 'Movies', 'Beauty'"
3,4,leomessi,358.6M,Indonesia,2.7M,3.5M,"'Cars and automobiles', 'Beauty', 'Health and ..."
4,5,zendaya,151.1M,United States,4.3M,5.8M,"'Parenting and family', 'Beauty', 'Art'"


## Data Preprocessing for influencer data

**Dealing with duplicate data**

In [37]:
print(f'{influencer_df.duplicated().sum()} duplicates detected')

0 duplicates detected


**Encoding text labels**

In [38]:
influencer_df["Categories"]=influencer_df["Categories"].apply(lambda x: x.split(", "))

In [39]:
mlb = MultiLabelBinarizer()
content_cats = pd.DataFrame(mlb.fit_transform(influencer_df['Categories']), 
                      columns=[col.replace("'", "").replace('"', '') for col in mlb.classes_]
                     )

In [40]:
influencer_df.drop(columns=['Categories','S.no'], inplace=True)

**Encoding engagement data**

In [41]:
def unit_stdzer(value):
    if value.endswith('K'):
        return float(value[:-1])
    elif value.endswith('M'):
        return float(value[:-1]) * 1000
    else:
        return float(value)

In [44]:
influencer_df.rename(columns={'followers':'followers(in K)', 'Average views':'avg views(in K)' ,'viewers':'viewers(in K)'},inplace=True)

In [45]:
num_cols=['followers(in K)', 'avg views(in K)','viewers(in K)']
for col in num_cols:
    influencer_df[col] = influencer_df[col].apply(unit_stdzer)

In [46]:
scaler = StandardScaler()
influencer_df[num_cols] = scaler.fit_transform(influencer_df[num_cols])

**Encoding demographic data**

In [47]:
# Treating null values
influencer_df['Country']=influencer_df['Country'].replace({None:influencer_df['Country'].mode()[0]})

Apply the function to the influencer data


In [48]:
influencer_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   username         1000 non-null   object 
 1   followers(in K)  1000 non-null   float64
 2   Country          1000 non-null   object 
 3   viewers(in K)    1000 non-null   float64
 4   avg views(in K)  1000 non-null   float64
dtypes: float64(3), object(2)
memory usage: 39.2+ KB


In [49]:
def get_lat_long(country, geo_cache):
    if country in geo_cache:
        return geo_cache[country]
    
    api_key="4b294297394b443cb57a072420ade0b6" # your opencage api key
    url = f"https://api.opencagedata.com/geocode/v1/json?q={country}&key={api_key}"
    
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check if the request was successful
        data = response.json()
        
        if data['results']:
            lat = data['results'][0]['geometry']['lat']
            long = data['results'][0]['geometry']['lng']
            geo_cache[country] = (lat, long)  # Cache the result for future use
            return lat, long
        else:
            return None, None
    except requests.exceptions.RequestException as e:
        print(f"Error fetching lat-long for {country}: {e}")
        return None, None

In [50]:
influencer_lat = []
influencer_long = []
geo_cache={}
for index, row in influencer_df.iterrows():
    lat, long = get_lat_long(row['Country'],geo_cache)
    influencer_lat.append(lat)
    influencer_long.append(long)

influencer_df['latitude'] = influencer_lat
influencer_df['longitude'] = influencer_long

In [51]:
influencer_df.drop(columns=["Country"],inplace=True)

In [52]:
influencer_df.head()

Unnamed: 0,username,followers(in K),viewers(in K),avg views(in K),latitude,longitude
0,cristiano,10.023948,4.612757,5.077109,22.351115,78.667743
1,kyliejenner,7.583637,3.051233,4.424813,39.78373,-100.445882
2,arianagrande,6.727972,2.426624,3.027035,39.78373,-100.445882
3,leomessi,7.372499,2.218421,2.56111,-2.483383,117.890285
4,zendaya,2.760799,3.884046,4.704368,39.78373,-100.445882


# Data Preprocessing users data

In [53]:
usersdf=pd.read_csv("SocialMediaUsers.csv") # User Data 

In [54]:
usersdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   UserID     100000 non-null  int64 
 1   Name       100000 non-null  object
 2   Gender     100000 non-null  object
 3   DOB        100000 non-null  object
 4   Interests  100000 non-null  object
 5   City       100000 non-null  object
 6   Country    100000 non-null  object
dtypes: int64(1), object(6)
memory usage: 5.3+ MB


In [55]:
usersdf['Interests']=usersdf['Interests'].apply(lambda x: x.split(", "))

In [56]:
usersdf.head()

Unnamed: 0,UserID,Name,Gender,DOB,Interests,City,Country
0,1,Jesse Lawhorn,Female,1958-10-15,"['Movies', 'Fashion', 'Fashion', 'Books']",Sibolga,Indonesia
1,2,Stacy Payne,Female,2004-07-21,"['Gaming', 'Finance and investments', 'Outdoor...",Al Abyār,Libya
2,3,Katrina Nicewander,Female,2000-02-07,"['DIY and crafts', 'Music', 'Science', 'Fashion']",Wādī as Sīr,Jordan
3,4,Eric Yarbrough,Male,1985-04-14,"['Outdoor activities', 'Cars and automobiles']",Matera,Italy
4,5,Daniel Adkins,Female,1955-09-18,"['Politics', 'History']",Biruaca,Venezuela


**Encoding User preference data**

In [57]:
user_preferences=pd.DataFrame(mlb.fit_transform(usersdf["Interests"]),
                    columns=[col.replace("'", "").replace('"', '') for col in mlb.classes_])
usersdf.drop(columns=['Interests'], inplace=True)

In [58]:
user_preferences

Unnamed: 0,Art,Beauty,Books,Business and entrepreneurship,Cars and automobiles,Cooking,DIY and crafts,Education and learning,Fashion,Finance and investments,...,Outdoor activities,Parenting and family,Pets,Photography,Politics,Science,Social causes and activism,Sports,Technology,Travel
0,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99996,0,0,0,0,1,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
99997,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99998,0,0,0,1,0,0,0,0,1,0,...,0,1,0,1,0,0,0,0,0,0


**Encoding Demographic Data of user**

In [59]:
usersdf['DOB'] = pd.to_datetime(usersdf['DOB'])
usersdf['DOB'] = usersdf['DOB'].apply(lambda x: datetime.now().year - x.year)
usersdf.rename(columns={'DOB':'Age'}, inplace=True)

In [60]:
usersdf['Gender'] = np.where(usersdf['Gender']=='Male', 1, 0)

In [61]:
user_lat = []
user_long = []
geo_cache={}
for index, row in usersdf.iterrows():
    lat, long = get_lat_long(row['Country'],geo_cache)
    user_lat.append(lat)
    user_long.append(long)

usersdf['latitude'] = user_lat
usersdf['longitude'] = user_long

In [62]:
usersdf.drop(columns=["UserID","City","Country"],inplace=True)

In [63]:
usersdf.head(10)

Unnamed: 0,Name,Gender,Age,latitude,longitude
0,Jesse Lawhorn,0,66,-2.483383,117.890285
1,Stacy Payne,0,20,26.823447,18.123672
2,Katrina Nicewander,0,24,31.166705,36.941628
3,Eric Yarbrough,1,39,42.638426,12.674297
4,Daniel Adkins,0,69,8.001871,-66.110932
5,Diane Jara,1,57,39.78373,-100.445882
6,Sheryl Morgan,0,55,54.702354,-3.276575
7,William Harper,1,59,14.584444,29.491769
8,Virginia Varron,1,40,25.624262,42.352833
9,Charles Figueroa,0,21,22.351115,78.667743


In [67]:
def haversine(lat1, lon1, lat2, lon2):
    R = 6371.0  # Radius of the Earth in kilometers
    lat1_rad = np.radians(lat1)
    lon1_rad = np.radians(lon1)
    lat2_rad = np.radians(lat2)
    lon2_rad = np.radians(lon2)

    dlon = lon2_rad - lon1_rad
    dlat = lat2_rad - lat1_rad

    a = (np.sin(dlat / 2) ** 2 +
         np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon / 2) ** 2)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return R * c  # Distance in kilometers

In [68]:
distance_matrix = pd.DataFrame(index=usersdf.index, columns=influencer_df.index)

# Fill the distance matrix with calculated distances
for user_idx, user_row in usersdf.iterrows():
    for influencer_idx, influencer_row in influencer_df.iterrows():
        distance = haversine(user_row['latitude'], user_row['longitude'],
                             influencer_row['latitude'], influencer_row['longitude'])
        distance_matrix.at[user_idx, influencer_idx] = distance

KeyboardInterrupt: 

In [None]:
epsilon = 1e-5
weights_matrix = 1 / (distance_matrix + epsilon)

Normalize the weights

In [None]:
weights_matrix = weights_matrix.div(weights_matrix.sum(axis=1), axis=0)

In [64]:
cos_sim = cosine_similarity(user_preferences, content_cats)

In [65]:
cos_sim

array([[0.        , 0.        , 0.33333333, ..., 0.        , 0.        ,
        0.33333333],
       [0.        , 0.57735027, 0.        , ..., 0.28867513, 0.28867513,
        0.28867513],
       [0.        , 0.        , 0.28867513, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.40824829, 0.        ,
        0.        ],
       [0.25819889, 0.        , 0.        , ..., 0.25819889, 0.        ,
        0.        ],
       [0.        , 0.40824829, 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [66]:
user_name="Charles Figueroa"
user_idx = usersdf[usersdf['Name'] == user_name].index[0]
influencer_indices = np.argsort(cos_sim[user_idx])[::-1]
top_influencers = influencer_df.iloc[influencer_indices[:10]]
list(top_influencers['username'])

['antony00',
 'everyone_woo',
 'pooorblack',
 'twentyonepilots',
 'jeonghaniyoo_n',
 'sunnyleone',
 'skuukzky',
 'badbunnypr',
 'ridwankamil',
 'l7nnon']