### Importing the required Libraries

In [521]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import ast

#### Loading the Dataset

In [522]:
data=pd.read_csv("data_video.csv")

In [523]:
## Removing the unnamed column i.e the index column
data.drop(columns=['Unnamed: 0'],axis=1,inplace=True)

## Content-based-filtering

In [524]:
data.head(5)

Unnamed: 0,id,category,title,comment_count,upvote_count,view_count,share_count,username,following,Name,posted_n_days_ago,popularity_score,genre,targeted_audience,post_description
0,1291,"['SolTok', 50, 'Ride the wave of Solana with S...",Can Moo Deng’s MEME COIN GO HIGHER_ #shorts,0,0,23,0,kinha,False,Sachin Kinha,41,41.210468,Unspecified,General Audience,Description not available.
1,1306,"['SolTok', 50, 'Ride the wave of Solana with S...",Culture of Solana Token $COST. # 2024 Trum...,0,0,28,0,kinha,False,Sachin Kinha,38,41.210468,Unspecified,General Audience,Description not available.
2,1276,"['SolTok', 50, 'Ride the wave of Solana with S...",Daily realisation #trading #crypto #bitcoin #e...,0,0,51,0,kinha,False,Sachin Kinha,41,43.002227,Unspecified,General Audience,Description not available.
3,1265,"['SolTok', 50, 'Ride the wave of Solana with S...",#crypto #cryptotrading #memecoin #solmemecoins...,0,0,33,0,kinha,False,Sachin Kinha,41,40.235948,Unspecified,General Audience,Description not available.
4,1314,"['SolTok', 50, 'Ride the wave of Solana with S...",😂#SOLANA #memecoin #crypto #dexscreener #bullrun,0,0,51,0,kinha,False,Sachin Kinha,38,41.845386,Unspecified,General Audience,Description not available.


### Dropping the duplicates on the basis of id and title because the title of many videos can be same but if the id of two or more videos are same it means that it is a duplicate

In [525]:
data.drop_duplicates(subset=['title','id'],inplace=True)

### Creating a function for to get the tags for the data

In [526]:
def create_tags_no_nested(row):
    tags = []
    for col in columns_to_use:
        value = row[col]
        if pd.notna(value):  
            if isinstance(value, (int, float, bool)):  
                value = str(value)
            elif isinstance(value, str) and value.startswith("[") and value.endswith("]"):
                try:
                    
                    parsed_list = ast.literal_eval(value)
                    if isinstance(parsed_list, list):  
                        tags.extend([str(item).strip() for item in parsed_list])
                except (ValueError, SyntaxError):
                    tags.append(value.strip())  
            else:
              
                tags.extend(value.split(',') if ',' in value else [value.strip()])
    return [tag.strip().lower() for tag in tags if tag] 
    
data['tags'] = data.apply(create_tags_no_nested, axis=1)

### Creating a new data for the algorithm

In [527]:
new=data[['id','title','tags','posted_n_days_ago','popularity_score']]

In [528]:
new.head(5)

Unnamed: 0,id,title,tags,posted_n_days_ago,popularity_score
0,1291,Can Moo Deng’s MEME COIN GO HIGHER_ #shorts,"[sachin kinha, soltok, 50, ride the wave of so...",41,41.210468
1,1306,Culture of Solana Token $COST. # 2024 Trum...,"[sachin kinha, soltok, 50, ride the wave of so...",38,41.210468
2,1276,Daily realisation #trading #crypto #bitcoin #e...,"[sachin kinha, soltok, 50, ride the wave of so...",41,43.002227
3,1265,#crypto #cryptotrading #memecoin #solmemecoins...,"[sachin kinha, soltok, 50, ride the wave of so...",41,40.235948
4,1314,😂#SOLANA #memecoin #crypto #dexscreener #bullrun,"[sachin kinha, soltok, 50, ride the wave of so...",38,41.845386


In [529]:
## Flattening the Lists of tags.
new['tags'] = new['tags'].apply(lambda x: " ".join(x))
new.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new['tags'] = new['tags'].apply(lambda x: " ".join(x))


Unnamed: 0,id,title,tags,posted_n_days_ago,popularity_score
0,1291,Can Moo Deng’s MEME COIN GO HIGHER_ #shorts,sachin kinha soltok 50 ride the wave of solana...,41,41.210468
1,1306,Culture of Solana Token $COST. # 2024 Trum...,sachin kinha soltok 50 ride the wave of solana...,38,41.210468
2,1276,Daily realisation #trading #crypto #bitcoin #e...,sachin kinha soltok 50 ride the wave of solana...,41,43.002227
3,1265,#crypto #cryptotrading #memecoin #solmemecoins...,sachin kinha soltok 50 ride the wave of solana...,41,40.235948
4,1314,😂#SOLANA #memecoin #crypto #dexscreener #bullrun,sachin kinha soltok 50 ride the wave of solana...,38,41.845386


### Converting the tags into vectors for similarity search

In [530]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')

In [531]:
vector = cv.fit_transform(new['tags']).toarray()

In [532]:
vector.shape

(1003, 3747)

### Constructing the similarity Matrix

In [533]:
from sklearn.metrics.pairwise import cosine_similarity


In [534]:
similarity = cosine_similarity(vector)

In [535]:
similarity

array([[1.        , 1.        , 1.        , ..., 0.29633363, 0.24595493,
        0.27498597],
       [1.        , 1.        , 1.        , ..., 0.29633363, 0.24595493,
        0.27498597],
       [1.        , 1.        , 1.        , ..., 0.29633363, 0.24595493,
        0.27498597],
       ...,
       [0.29633363, 0.29633363, 0.29633363, ..., 1.        , 0.37483569,
        0.47894747],
       [0.24595493, 0.24595493, 0.24595493, ..., 0.37483569, 1.        ,
        0.3229876 ],
       [0.27498597, 0.27498597, 0.27498597, ..., 0.47894747, 0.3229876 ,
        1.        ]])

# Function for Building the User Profile

### Since i was not able to fetch the user data we will be creating a user profile based on the post data we have got
#### 1. This function will be core for the personlized recommendations.
#### 2. This function will return some things which will show tht interest of an user.such as the genres and the post summary of the video posted by the user in the last 7 days.
#### 3. The most popular video posted by the user in the last 7 days will be given to the recommendation function which will return the recommendation.

In [536]:
##Function for building the user-profile
def build_user_profile(username,data=data):
    user_posts = data[data['username'] == username]
    recent_posts = user_posts.sort_values('posted_n_days_ago', ascending=True).iloc[:7,:]
    
    user_genre=recent_posts['genre']
    user_interest=recent_posts['post_description']
    titles=recent_posts['title']
    max=recent_posts['popularity_score'].max()
    popular_post=recent_posts[recent_posts['popularity_score']==max].head(1)['title']
    
 
    user_profile = {
        'genres': user_genre.to_list(),
        'interest': user_interest.to_list(),
        'title':titles.to_list(),
        'popular_post':popular_post.to_list(),
    }
    
    return user_profile

## Function for Personlized Recommendations

In [537]:
def personlized_recommendation(username,n=5):
        user_profile = build_user_profile(username)
        video=user_profile['popular_post'][0]
        index = new[new['title'] == video].index[0]
        distances = sorted(list(enumerate(similarity[index])), reverse=True, key=lambda x: x[1])
        recommended_titles = [new.iloc[i[0]]['title'] for i in distances[1:n+1]]
        return recommended_titles

## Function for recommending videos for new Users on the basis of the genres they have selected 

In [538]:
def cold_start(username,n=2):
    user_profile = build_user_profile(username)
    result_videos = pd.DataFrame()
    for genre in user_profile['genres']:
        filtered_videos = user_posts[user_posts['genre'] == genre].sort_values(by='popularity_score', ascending=False).head(n)
        result_videos = pd.concat([result_videos, filtered_videos], ignore_index=True).sort_values(by='popularity_score',ascending=False)
    return set(result_videos['title'].to_list())

## Function for Recommending the trending videos

In [539]:
def calculate_trending_score(df, weights=None):
    if weights is None:
        weights = {
            'view_count': 0.3,
            'upvote_count': 0.2,
            'share_count': 0.2,
            'comment_count': 0.1,
            'popularity_score': 0.1,
            'recency': 0.1
        }
    scaler = MinMaxScaler()
    for col in ['view_count', 'upvote_count', 'share_count', 'comment_count', 'popularity_score']:
        df[f'scaled_{col}'] = scaler.fit_transform(df[[col]])
    df['recency_factor'] = 1 / (1 + df['posted_n_days_ago'])
    df['trending_score'] = (
        weights['view_count'] * df['scaled_view_count'] +
        weights['upvote_count'] * df['scaled_upvote_count'] +
        weights['share_count'] * df['scaled_share_count'] +
        weights['comment_count'] * df['scaled_comment_count'] +
        weights['popularity_score'] * df['scaled_popularity_score'] +
        weights['recency'] * df['recency_factor']
    )
    
    return df[['title', 'trending_score']].sort_values('trending_score', ascending=False).head(10)

### Testing the function

In [540]:
print("Content-Based Recommendations for Existing User:")
videos=personlized_recommendation(username='kinha')
print(videos)

Content-Based Recommendations for Existing User:
['Culture of Solana Token $COST.   #  2024  Trump.  Trump Supporters', 'Daily realisation #trading #crypto #bitcoin #ethereum #solana #memecoins', '#crypto #cryptotrading #memecoin #solmemecoins #solanamemecoin', '😂#SOLANA #memecoin #crypto #dexscreener #bullrun', 'Pump fun and the trenches in shambles #pumpfun #solana #memecoins']


In [541]:
print("Recommendations for New User:")
new_rec=cold_start(username='afrobeezy')
print(new_rec)

Recommendations for New User:
{'act now!', 'push through the pain 🫡', 'UntitledVideo'}


In [542]:
trending_videos = calculate_trending_score(data)
trending_videos['title'].to_list()

['What is DAI Stablecoin',
 'Silicon Valley Bank has sent shockwaves through the crypto world by causing $USDC to depeg.',
 'Why Should I Buy $BNB === People always ask WHEN a good time to buy crypto is, but they should really be asking WHY they should',
 'Did you miss out on $PEPE  \u2028People turned $100 into $1 million in the past 30 day',
 'UntitledVideo',
 'Recipe for a flow state',
 'UntitledVideo',
 'Walk in nature daily. Because to spend time with beauty is to enrich your relationship with beauty.\xa0#leadership #productivity #service',
 'UntitledVideo',
 "Big News_ Blackrock's Ethereum ETF Likely to Be Approved in July! This will cause a big pump for the crypto market so now is the tike to front run #cryptonews #ethereum #eth #fyp #foryou #CapCut"]

## End of Project