In [32]:
import pandas as pd 
import neattext.functions as nfx

In [33]:
# Load ML/Rc Pkgs
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity,linear_kernel

In [34]:
file_paths = ["../datasets/youtube/amine.csv", "../datasets/youtube/imad.csv", "../datasets/youtube/anis.csv", "../datasets/youtube/haroune.csv"]
data_frames = [pd.read_csv(file) for file in file_paths]

In [35]:
columns = ['Video ID', 'Channel ID', 'Title', 'Category ID', 'Tags', 'Channel Title',
           'Subscriber Count', 'Views', 'Likes', 'Dislikes', 'Comment Count', 'URL',
           'Language', 'ISO Duration', 'Duration (Seconds)', 'Theme']


In [36]:
merged_df = pd.DataFrame(columns=columns)

for file_path in file_paths:
    # Read the CSV file
    df = pd.read_csv(file_path, header=None)

    # Add column headers
    df.columns = columns

    # Concatenate the data
    merged_df = pd.concat([merged_df, df], ignore_index=True)

# Remove duplicates
merged_df.drop_duplicates(inplace=True)
merged_df = merged_df.drop(columns = ['Dislikes','ISO Duration','Video ID','Channel ID','URL'] ,axis=1)

# Save the merged and deduplicated DataFrame to a new CSV file
merged_df.to_csv('merged_data.csv', index=False)

In [37]:
df =merged_df

# Count missing values in each column
missing_values = df.isnull().sum()

# Display the number of missing values
print(missing_values)

Title                    0
Category ID              0
Tags                  2835
Channel Title            0
Subscriber Count         0
Views                    0
Likes                    0
Comment Count            0
Language                 0
Duration (Seconds)       0
Theme                    0
dtype: int64


In [38]:
df.describe()

Unnamed: 0,Title,Category ID,Tags,Channel Title,Subscriber Count,Views,Likes,Comment Count,Language,Duration (Seconds),Theme
count,13445,13445,10610,13445,13445,13445,13445,13445,13445,13445,13445
unique,9729,14,7325,5215,2725,12004,5948,1746,49,2951,226
top,NASA Just Shut Down Quantum Computer After Som...,27,"webdev, app development, lesson, tutorial",Simplilearn,3540000,5,0,0,en,60,python programming
freq,12,6555,96,527,468,12,387,1723,6534,165,148


In [39]:
languages_to_delete = ['ur', 'ta', 'ml', 'de', 'id', 'pt-BR', 'bn', 'es', 'te', 'si', 'it', 'es-419',
                        'th', 'zh-Hans', 'ru', 'ja', 'pt-PT', 'ko', 'nl', 'tlh', 'bn-IN', 'ro', 'es-ES',
                        'en-IE', 'sv', 'arc', 'tr', 'zh-CN', 'de-DE', 'da', 'so', 'fil', 'zh', 'es-MX',
                        'mr', 'zh-TW']

# Filter the DataFrame to exclude rows with the specified languages
df = df[~df['Language'].isin(languages_to_delete)]

# Define a mapping for 'Language'
language_mapping = {
    'en': 'en',
    'en-GB': 'en',
    'en-US': 'en',
    'Unknown': 'Unknown',
    'en-IN': 'en',
    'hi': 'hi',
    'en-CA': 'en',
    'zxx': 'Unknown',
    'pt': 'pt',
    'ar': 'ar',
    'fr': 'fr',
    'fr-CA': 'fr',
    'fr-FR': 'fr',
}

# Map the values in the 'Language' column
df['Language'] = df['Language'].map(language_mapping)
language_counts = df['Language'].value_counts()

# Display the counts
print(language_counts)



Language
en         9358
Unknown    3473
hi          366
pt           26
ar           23
fr           16
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Language'] = df['Language'].map(language_mapping)


In [40]:
df.head()
df.rename(columns={'Title': 'course_title'}, inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={'Title': 'course_title'}, inplace=True)


In [41]:
df.shape

(13262, 11)

In [42]:
df_cleaned = df.drop(['Category ID', 'Tags'], axis=1)


In [43]:
#we remove nan values 
df_cleaned.isna().sum()

course_title          0
Channel Title         0
Subscriber Count      0
Views                 0
Likes                 0
Comment Count         0
Language              0
Duration (Seconds)    0
Theme                 0
dtype: int64

In [44]:
test_row = df_cleaned.iloc[0]
test_row_df = test_row.to_frame().transpose()

# Resetting the index if necessary
test_row_df.reset_index(drop=True, inplace=True)

print(test_row_df)

                                        course_title     Channel Title  \
0  Harvard CS50’s Artificial Intelligence with Py...  freeCodeCamp.org   

  Subscriber Count    Views  Likes Comment Count Language Duration (Seconds)  \
0          8820000  1357207  41136           633       en              42682   

        Theme  
0  ai courses  


In [45]:
# Clean Text:stopwords,special charac
df_cleaned['clean_course_title'] = df_cleaned['course_title'].apply(nfx.remove_stopwords)
# Clean Text:stopwords,special charac
df_cleaned['clean_course_title'] = df_cleaned['clean_course_title'].apply(nfx.remove_special_characters)

df_cleaned[['course_title','clean_course_title']] 
df_cleaned.head()

Unnamed: 0,course_title,Channel Title,Subscriber Count,Views,Likes,Comment Count,Language,Duration (Seconds),Theme,clean_course_title
0,Harvard CS50’s Artificial Intelligence with Py...,freeCodeCamp.org,8820000,1357207,41136,633,en,42682,ai courses,Harvard CS50s Artificial Intelligence Python ...
1,Artificial Intelligence Full Course | Artifici...,edureka!,3900000,3330531,67797,883,en,17571,ai courses,Artificial Intelligence Course Artificial Int...
2,Google just launched a free course on AI. You'...,Python Programmer,405000,244645,6421,142,en,188,ai courses,Google launched free course AI like
3,Google’s AI Course for Beginners (in 10 minutes)!,Jeff Su,652000,267451,4531,200,en,558,ai courses,Googles AI Course Beginners in 10 minutes
4,Introduction to Generative AI,Google Cloud Tech,1080000,1218371,17346,289,en,1328,ai courses,Introduction Generative AI


In [46]:
# Vectorize our Text
count_vect = TfidfVectorizer()
cv_mat = count_vect.fit_transform(df_cleaned['clean_course_title'])

In [47]:
cv_mat.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [48]:
df_cv_words = pd.DataFrame(cv_mat.todense(),columns=count_vect.get_feature_names_out())
count_vect.get_feature_names_out()[385]
df_cv_words



Unnamed: 0,00,001,0016,007,01,01nsj,02,03,0301,033,...,zone,zoo,zoom,zos,zrich,zsense,ztransform,zuckerberg,zurich,zyxel
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13257,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13258,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13259,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13260,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

# Assuming df_cleaned is your original DataFrame
# and you want to normalize all numeric columns

# Select the numeric columns you want to normalize
numeric_columns = df_cleaned.select_dtypes(include=['number'])

# Check if there are any numeric columns
if not numeric_columns.empty:
    # Convert numeric_columns to a NumPy array
    numeric_array = numeric_columns.values

    # Create a MinMaxScaler object
    scaler = MinMaxScaler()

    # Fit the scaler to your numeric data
    scaler.fit(numeric_array)

    # Transform the numeric data to the [0, 1] range
    normalized_data = scaler.transform(numeric_array)

    # Create a new DataFrame with the normalized data
    normalized_df = pd.DataFrame(normalized_data, columns=numeric_columns.columns)
else:
    # Handle the case where there are no numeric columns
    normalized_df = pd.DataFrame()
    print("No numeric columns found for normalization.")


No numeric columns found for normalization.


In [50]:
df_cv_words=df_cv_words.reset_index(drop=True)
normalized_df=normalized_df.reset_index(drop=True)


df_final = pd.concat([normalized_df,df_cv_words],axis=1)
df_final.tail(20)

df_final.isna().sum()

00            0
001           0
0016          0
007           0
01            0
             ..
zsense        0
ztransform    0
zuckerberg    0
zurich        0
zyxel         0
Length: 9325, dtype: int64

In [51]:
# Cosine Similarity Matrix
cosine_sim_mat = cosine_similarity(cv_mat)
similarity_matrix = cosine_similarity(df_final)


In [52]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Assuming you have the following variables defined:
# df - your dataframe containing course titles
# course_indices - a mapping from course titles to their index in the dataframe
# cosine_sim_mat - your precomputed cosine similarity matrix

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')

def preprocess_title(title):
    # Tokenize the title
    tokens = word_tokenize(title)
    # Convert to lower case
    tokens = [token.lower() for token in tokens]
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HR\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HR\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HR\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\HR\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [53]:
df_cleaned
df = df_cleaned

In [54]:
preprocess_title("web dev on - course")

['web', 'dev', '-', 'course']

In [55]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import process

In [56]:

def recommend_course(text,num_of_rec=10):
    def find_best_matches(processed_tokens, df, num_of_rec=10):
        # Combine tokens back to a string
        processed_title = ' '.join(processed_tokens)
        
        # Create a TF-IDF Vectorizer object
        vectorizer = TfidfVectorizer()
        
        # Fit and transform the course titles
        tfidf_matrix = vectorizer.fit_transform(df_cleaned['course_title'])
        
        # Transform the processed title
        processed_tfidf = vectorizer.transform([processed_title])
        
        # Calculate cosine similarity
        cosine_similarities = cosine_similarity(processed_tfidf, tfidf_matrix).flatten()
        
        # Get the top matches indices sorted by similarity score
        top_matches_indices = cosine_similarities.argsort()[-num_of_rec:][::-1]

        # Get the top matches similarity scores
        top_matches_scores = cosine_similarities[top_matches_indices]

        # Create a DataFrame with indices and scores
        top_matches_df = pd.DataFrame({'index': top_matches_indices, 'similarity_score': top_matches_scores})
        
        # Merge the original DataFrame with the top matches DataFrame
        merged_df = top_matches_df.merge(df, left_on='index', right_index=True)
        merged_df = merged_df[merged_df['similarity_score']>0.25]
        
        return merged_df.drop(columns=['index'])
    processed_tokens = preprocess_title(text)
    return find_best_matches(processed_tokens,df,num_of_rec=num_of_rec)




In [57]:
recommend_course('artificial intelligence',20)

Unnamed: 0,similarity_score,course_title,Channel Title,Subscriber Count,Views,Likes,Comment Count,Language,Duration (Seconds),Theme,clean_course_title
0,0.834978,This is the open source software I use each da...,Awesome Open Source,117000,154925,4607,180,en,1831,Open Source Software,open source software use day 2023this 2 2 seri...
1,0.828979,What Programming Languages should you Learn fo...,Robotix with Sina,50700,58429,3711,69,en,47,robotics and automation,Programming Languages Learn Robotics
2,0.828979,Residue theorem/L3/Residue theorem application...,H.D. MATHEMATICS,88300,3100,125,32,Unknown,1323,"Complex Analysis: Analytic Functions, Conforma...",Residue theoremL3Residue theorem application e...
3,0.828979,Artificial Intelligence Full Course | Artifici...,edureka!,3900000,3330531,67797,883,en,17571,ai courses,Artificial Intelligence Course Artificial Int...
4,0.828979,What is an Open Source Software With Full Info...,Quick Support,6810000,226803,10643,290,Unknown,532,Open Source Software,Open Source Software Information Hindi Quick...
6,0.821319,How Open-Source Software Can Change Our Lives ...,TEDx Talks,39800000,27609,441,34,Unknown,783,Open Source Software,OpenSource Software Change Lives Brad Griffit...
7,0.821319,COMPLEX INTEGRATION | LINE INTEGRAL | S-1 | EN...,SAURABH DAHIVADKAR,232000,58850,986,23,hi,650,"Complex Analysis: Analytic Functions, Conforma...",COMPLEX INTEGRATION LINE INTEGRAL S1 ENGINE...
8,0.807345,Lecture 6: Version Control (git) (2020),Missing Semester,92900,641460,13612,303,en,5100,Version Control Systems,Lecture 6 Version Control git 2020
9,0.802559,The truth about Open Source 👩‍💻 #programming #...,Coding with Lewis,483000,267003,14021,214,Unknown,33,Open Source Software,truth Open Source programming tech code techn...
10,0.802559,Necessary Condition for w = f(z) to represent ...,Spectrum of Mathematics,980,450,8,0,en,779,"Complex Analysis: Analytic Functions, Conforma...",Necessary Condition w fz represent Conformal ...
