In [19]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [20]:
data = pd.read_csv("\computer vision\datasets\countries-languages.csv")

In [21]:
data

Unnamed: 0,Country,Languages Spoken
0,Afghanistan,"Dari Persian, Pashtu (both official), other Tu..."
1,Albania,"Albanian (Tosk is the official dialect), Greek"
2,Algeria,"Arabic (official), French, Berber dialects"
3,Andorra,"Catalán (official), French, Castilian, Portuguese"
4,Angola,"Portuguese (official), Bantu and other African..."
...,...,...
193,Vietnam,Vietnamese (official); English (increasingly f...
194,Western Sahara (proposed state),"Hassaniya Arabic, Moroccan Arabic"
195,Yemen,Arabic
196,Zambia,"English (official); major vernaculars: Bemba, ..."


In [22]:
data.columns

Index(['Country', 'Languages Spoken'], dtype='object')

In [23]:
data['Languages_Spoken'] = data['Languages Spoken'].fillna('')


In [24]:
data_new = data.drop(['Languages Spoken'], axis=1, errors='ignore')  # Added errors='ignore' to avoid KeyError if the column does not exist

In [25]:
s = data_new['Languages_Spoken'].str.split(', ').apply(pd.Series, 1).stack()

In [26]:
s.index = s.index.droplevel(-1)

In [27]:
s.name = 'Language_Spoken'

In [28]:
data_new = data_new.join(s)

In [29]:
data_new['Language_Spoken'] = data_new['Language_Spoken'].fillna('')

In [36]:
data_new

Unnamed: 0,Country,Languages_Spoken,Language_Spoken
0,Afghanistan,"Dari Persian, Pashtu (both official), other Tu...",Dari Persian
0,Afghanistan,"Dari Persian, Pashtu (both official), other Tu...",Pashtu (both official)
0,Afghanistan,"Dari Persian, Pashtu (both official), other Tu...",other Turkic and minor languages
1,Albania,"Albanian (Tosk is the official dialect), Greek",Albanian (Tosk is the official dialect)
1,Albania,"Albanian (Tosk is the official dialect), Greek",Greek
...,...,...,...
196,Zambia,"English (official); major vernaculars: Bemba, ...",Tonga; about 70 other indigenous languages
197,Zimbabwe,"English (official), Shona, Ndebele (Sindebele)...",English (official)
197,Zimbabwe,"English (official), Shona, Ndebele (Sindebele)...",Shona
197,Zimbabwe,"English (official), Shona, Ndebele (Sindebele)...",Ndebele (Sindebele)


In [30]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(data_new['Language_Spoken'])

In [31]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [32]:
indices = pd.Series(data_new.index, index=data_new['Country']).drop_duplicates()

In [35]:
def language_recommender(language, cosine_sim=cosine_sim, df=data_new, indices=indices):
    lang_indices = df[df['Language_Spoken'].str.contains(language)].index.tolist()
    if not lang_indices:
        return []
    sim_scores = [np.mean([cosine_sim[i][j] for j in lang_indices]) for i in range(len(df))]
    sorted_scores = sorted(list(enumerate(sim_scores)), key=lambda x: x[1], reverse=True)
    sorted_scores = sorted_scores[1:11] 
    country_indices = [i[0] for i in sorted_scores]
    return df['Country'].iloc[country_indices]

In [37]:
language_recommender('Dari Persian')

185    United Arab Emirates
78                     Iran
0               Afghanistan
0               Afghanistan
1                   Albania
1                   Albania
2                   Algeria
2                   Algeria
2                   Algeria
3                   Andorra
Name: Country, dtype: object