# **Libraries**

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import torch
from transformers import AutoTokenizer, AutoModel
from transformers import BertTokenizer,AutoModel,AdamW,AutoConfig, AutoModelForSequenceClassification, AutoTokenizer
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import lr_scheduler
import torch.nn.functional as F
import torch.nn as nn
device = "cuda" if torch.cuda.is_available() else "cpu"
%env TOKENIZERS_PARALLELISM=true
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
class CFG:
    """Notebook-wide constants: input locations and retrieval settings."""

    # Directory holding the competition CSV files.
    INPUT = '/kaggle/input/learning-equality-curriculum-recommendations'
    # Directory the pretrained encoder and tokenizer are loaded from.
    MODEL = '/kaggle/input/paraphrasemultilingualminilml12v2'
    # Token ids beyond this many are cut off before encoding.
    MAX_LEN = 511
    # How many nearest content items are kept per topic.
    SELECT_TOP_N = 5

# **Topics dataset**

In [None]:

topics = pd.read_csv(f'{CFG.INPUT}/topics.csv')

In [None]:
topics.head

In [None]:
topics.info

In [None]:
print("Missing values : ")
topics.isnull().sum()

In [None]:
# Topics per language code (missing values counted too), ordered by code.
language = (
    topics['language']
    .value_counts(dropna=False)
    .sort_index()
)
language

In [None]:

# Bar chart: number of topics for each language code.
plt.figure(figsize=(10, 10))
sns.countplot(data=topics, x='language')
plt.title('language')
plt.xticks(rotation=90)  # vertical tick labels so they do not overlap
plt.show()

In [None]:

# Bar chart: number of topics at each level value.
plt.figure(figsize=(10, 10))
sns.countplot(data=topics, x='level')
plt.title('level')
plt.xticks(rotation=90)  # vertical tick labels so they do not overlap
plt.show()

In [None]:
# The 20 most frequent channels; value_counts orders by count, descending.
channel = topics['channel'].value_counts().iloc[:20]
channel

In [None]:
# Bar chart of the channel counts held in `channel`.
channel.plot.bar(color="darkblue", figsize=(20, 10))
plt.title("Top channel")
plt.ylabel("count of channel")
plt.xlabel("name of channel")
plt.show()

In [None]:
# Sum of has_content per category, largest first.
# NOTE(review): the positional slice 1:21 skips the first (largest) row -
# confirm that dropping the top category is intended.
category_ = (
    topics.groupby("category")[["has_content"]]
    .sum()
    .sort_values(by='has_content', ascending=False)
    .iloc[1:21]
)
category_

In [None]:
# Bar chart of the per-category has_content totals held in `category_`.
category_.plot.bar(color="red", figsize=(20, 10))
plt.title("Number of has_content in category_")
plt.ylabel(" has_content")
plt.xlabel("Category")
plt.show()

In [None]:
# Visualizing level data based on category
sns.countplot(data=topics, x='level', hue='category', palette=['blue','green'])

In [None]:
sns.countplot(data=topics, x='level', hue='has_content', palette=['red','blue'])

In [None]:
# For each categorical column, draw a bar chart of how many topics take each
# value, annotated with that value's share of all topics.
for col in ["level", "category", "has_content", "channel", "language"]:
    # Name the size column explicitly so counts and percentages cannot be
    # mislabelled (renaming the columns positionally swapped the two).
    y = topics.groupby(col).size().reset_index(name='counts')
    # Share of all rows in `topics`; rows where `col` is missing are not in
    # any group, so the shares may sum to less than 100.
    y['percentage'] = 100 * y['counts'] / topics.shape[0]
    fig = px.bar(
        y,
        x=col,
        y=['counts'],
        color=col,
        text=y['percentage'].map('{:.2f}%'.format),
    )
    fig.show()

# **Correlations dataset**

In [None]:
corelations = pd.read_csv(f'{CFG.INPUT}/correlations.csv')
corelations.head

In [None]:
corelations.info

In [None]:
corelations.shape

In [None]:
#Find the duplicates
corelations.duplicated().sum()

In [None]:
# Checking the null values in corelations
print("Missing values : ")
corelations.isnull().sum()

# **content dataset**

In [None]:
content = pd.read_csv(f'{CFG.INPUT}/content.csv')
content.head

In [None]:
content.info

In [None]:
content.columns

In [None]:
content.shape

In [None]:
# Checking the null values in content
print("Missing values : ")
content.isnull().sum()

In [None]:
#Find the duplicates
content.duplicated().sum()

In [None]:

# Horizontal count plot of content kinds, with the count printed beside
# each bar.
color_pal = sns.color_palette("bright")
figure = plt.figure(figsize=(8, 8))
fig = sns.countplot(y=content['kind'], color=color_pal[8], edgecolor="#111111")
for bar_group in fig.containers:
    fig.bar_label(bar_group, padding=10)
plt.title('Distribution of the kinds  in the content', fontsize=15)
plt.show()

In [None]:
# Bar chart: number of content items for each language code.
plt.figure(figsize=(10, 10))
sns.countplot(data=content, x='language')
plt.title('Language')
plt.xticks(rotation=90)  # vertical tick labels so they do not overlap
plt.show()

In [None]:
# For each categorical column, draw a bar chart of how many content items
# take each value, annotated with that value's share of all content rows.
for col in ["kind", "license"]:
    # Name the size column explicitly so counts and percentages cannot be
    # mislabelled (renaming the columns positionally swapped the two).
    y = content.groupby(col).size().reset_index(name='counts')
    # Share of all rows in `content`; rows where `col` is missing are not in
    # any group, so the shares may sum to less than 100.
    y['percentage'] = 100 * y['counts'] / content.shape[0]

    fig = px.bar(
        y,
        x=col,
        y=['counts'],
        color=col,
        text=y['percentage'].map('{:.2f}%'.format),
    )
    fig.show()

In [None]:
# Pie chart: share of each content kind among rows whose language is "ar".
pie_content = pd.DataFrame(content[content['language'] == "ar"]['kind'].value_counts())
# Label wedges with the kind names (the index of the counts table), not with
# row numbers of the full content table. The figure size is passed to the
# plot call because pandas opens its own figure when subplots=True, which
# would leave a separately created figure empty.
pie_content.plot.pie(
    subplots=True,
    labels=pie_content.index.values,
    autopct='%1.1f%%',
    startangle=82,
    figsize=(15, 10),
)
plt.title('kind in language arabic')
plt.gca().set_aspect('equal')

# **Dataset submission**

In [None]:
# Sample submission file; its topic_id column is read later to decide which
# topics to predict for.
sub = pd.read_csv(filepath_or_buffer=f'{CFG.INPUT}/sample_submission.csv')
sub

# **Model: paraphrase-multilingual-MiniLM-L12-v2**

In [None]:
# Load the pretrained encoder from CFG.MODEL, move it to the selected device
# and switch it to evaluation mode.
model = AutoModel.from_pretrained(CFG.MODEL).to(device).eval()

# Tokenizer loaded from the same directory as the model.
tokenizer = AutoTokenizer.from_pretrained(CFG.MODEL)

In [None]:
from tqdm.auto import tqdm

# Encode every content row into one vector: the mean of the encoder's last
# hidden states for the row's text.
vecs = []
for _, row in tqdm(content.iterrows(), total=len(content)):
    # Use the first non-missing field among title, description and text.
    # If all three are missing, fall back to a fixed placeholder sentence
    # so the tokenizer is never handed NaN.
    title = next(
        (row[field] for field in ('title', 'description', 'text') if pd.notna(row[field])),
        "This content contains no text.",
    )

    # Cut each tokenizer output to CFG.MAX_LEN entries and add a leading
    # batch dimension of size 1.
    tok = {
        k: torch.tensor(v[:CFG.MAX_LEN]).to(device).unsqueeze(0)
        for k, v in tokenizer(title).items()
    }
    with torch.no_grad():
        output = model(**tok)
    # Drop the batch dimension, then average over dimension 0
    # (presumably the token axis - confirm against the model output shape).
    vec = output.last_hidden_state.squeeze(0).mean(0).cpu()
    vecs.append(vec)

vecs1 = torch.stack(vecs)

In [None]:
topic_ids = sub['topic_id'].tolist()
# Select the topics in exactly the order of `sub`, one row per entry of
# `topic_ids`, so the i-th vector (and later the i-th prediction) belongs to
# the i-th submission row. A membership filter would return rows in `topics`
# order instead. Ids absent from `topics` yield an all-NaN row and receive
# the placeholder text below.
# NOTE(review): reindex requires topics['id'] to be unique - confirm.
topics_df = topics.set_index('id').reindex(topic_ids).rename_axis('id').reset_index()
vecs = []
for _, row in tqdm(topics_df.iterrows(), total=len(topics_df)):
    # Use the title, else the description, else a fixed placeholder.
    title = next(
        (row[field] for field in ('title', 'description') if pd.notna(row[field])),
        "This content contains no text.",
    )

    # Cut each tokenizer output to CFG.MAX_LEN entries and add a leading
    # batch dimension of size 1.
    tok = {
        k: torch.tensor(v[:CFG.MAX_LEN]).to(device).unsqueeze(0)
        for k, v in tokenizer(title).items()
    }
    with torch.no_grad():
        output = model(**tok)
    # Drop the batch dimension, then average over dimension 0
    # (presumably the token axis - confirm against the model output shape).
    vec = output.last_hidden_state.squeeze(0).mean(0).cpu()
    vecs.append(vec)

vecs2 = torch.stack(vecs)

In [None]:
import cupy as cp
from cuml.metrics import pairwise_distances

# Convert both embedding matrices to CuPy arrays for the distance computation.
vecs1 = cp.asarray(vecs1)
vecs2 = cp.asarray(vecs2)

# Content ids as a positional array: argsort yields row positions, so index
# by position rather than by DataFrame label, and do the column extraction
# once instead of once per hit.
content_ids = content['id'].to_numpy()

predicts = []
for v2 in vecs2:
    # Cosine distance from this topic vector to every content vector
    # (smaller means closer).
    dist = pairwise_distances(v2.reshape(1, len(v2)), vecs1, metric='cosine')
    # Positions of the CFG.SELECT_TOP_N smallest distances, copied to host.
    nearest = dist.argsort(1)[0, :CFG.SELECT_TOP_N].get()
    predicts.append(" ".join(content_ids[nearest]))
sub['content_ids'] = predicts
sub.head()

In [None]:
# Write the predictions without the row index column (`index` is documented
# as a bool, so pass False explicitly rather than relying on None being falsy).
sub.to_csv('submission.csv', index=False)