In [4]:
import pandas as pd

In [5]:
# Read the labelled training sentences and preview the first rows.
train_csv_path = "data/train_submission.csv"
df = pd.read_csv(train_csv_path)
df.head()

Unnamed: 0,ID,Usage,Text,Label
0,136,Public,Finalment Atena le recibe en l'acropoli d'Ate...,arg
1,62,Public,Jane Laffort fille de Joseph Laffort et d' Ang...,lat
2,74,Public,Сонзэ ялаксонзо - Роджер Джозеф Бошкович - у...,myv
3,40,Public,Mɛniɛ nkùɔ dìì mɔ̀nnì bɛnìtìbɛ̀ kɛ́deè kɛ̀ Nɔ...,tbz
4,30,Public,Ka go dirisa thekniki yeo ya phetogonepiso Le...,tsn


In [6]:
# Preview one sample sentence per label.
# `first()` keeps the first non-null Text found in each Label group.
df.groupby("Label")[["Text"]].first()

Unnamed: 0_level_0,Text
Label,Unnamed: 1_level_1
abk,Саид иан дыҟам иаб дыҟам ишырҳәауа еиԥш дма...
ace,Meunasah Hagu di Keucamatan Baktiya Barat Acè...
ach,En owaci pe emito cung iwi bye pien ka madong ...
acm,دق بقلبي دق شبعني عشق يا اول بشر حبيته صدق
acr,Mi 'at tzij piquiwi iwachi'il queje ile i ...
...,...
zea,Dat oor d' as 't woord dat d'rop volg mee 'n k...
zho,宁波东洲传动件有限公司-新闻中心
zlm,Di sebelah barat haruslah ada panji-panji pasu...
zsm,Itu baru Hazwin kalau Linda bagaimana?


In [7]:
# Build one long document per label by joining all of its sentences with single spaces.
df_combined = df.groupby("Label")[["Text"]].agg(" ".join)

# remove links, emojis, hashtags, mentions
import re

# Patterns are compiled once here instead of being re-parsed on every call.
_URL_RE = re.compile(r"http\S+")
_MENTION_RE = re.compile(r"@[a-zA-Z0-9_]+")
_HASHTAG_RE = re.compile(r"#[a-zA-Z0-9_]+")
# Only the supplementary-plane emoji / pictograph blocks (U+1F000-U+1FAFF).
# The earlier class [\U00010000-\U0010ffff] removed *every* code point above the
# BMP, which also deleted genuine text written in supplementary-plane scripts
# (e.g. Gothic letters, CJK Extension B ideographs).
_EMOJI_RE = re.compile(r"[\U0001F000-\U0001FAFF]")


def clean_text(text):
    """Strip social-media noise from ``text`` and return the cleaned string.

    Removed, in this order:
      * links: ``http`` followed by any run of non-whitespace characters;
      * mentions: ``@`` followed by ASCII letters, digits or underscores;
      * hashtags: ``#`` followed by ASCII letters, digits or underscores;
      * emoji / pictographs in the range U+1F000-U+1FAFF.

    Characters of supplementary-plane writing systems are kept.
    Whitespace left behind by a removal is not collapsed.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        ``text`` with the patterns above deleted.
    """
    for pattern in (_URL_RE, _MENTION_RE, _HASHTAG_RE, _EMOJI_RE):
        text = pattern.sub("", text)
    return text

# Clean each label's combined document, then record how many characters remain.
df_combined = df_combined.assign(
    Text=lambda frame: frame["Text"].map(clean_text),
    Length=lambda frame: frame["Text"].map(len),
)
df_combined

Unnamed: 0_level_0,Text,Length
Label,Unnamed: 1_level_1,Unnamed: 2_level_1
abk,Саид иан дыҟам иаб дыҟам ишырҳәауа еиԥш дма...,8693
ace,Meunasah Hagu di Keucamatan Baktiya Barat Acè...,12901
ach,En owaci pe emito cung iwi bye pien ka madong ...,12628
acm,دق بقلبي دق شبعني عشق يا اول بشر حبيته صدق ه...,5255
acr,Mi 'at tzij piquiwi iwachi'il queje ile i ...,18934
...,...,...
zea,Dat oor d' as 't woord dat d'rop volg mee 'n k...,13670
zho,宁波东洲传动件有限公司-新闻中心 我也有我的注重，老兄 你应该帮我想想，我才是你他妈的该注重...,2448
zlm,Di sebelah barat haruslah ada panji-panji pasu...,13934
zsm,Itu baru Hazwin kalau Linda bagaimana? Penamb...,11574


In [45]:
# Apply TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Each label's combined text is one document; vectorize them with scikit-learn's
# default TfidfVectorizer settings, giving one row per label.
label_documents = df_combined["Text"]
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(label_documents)

In [46]:
# Show similarity between each label
from sklearn.metrics.pairwise import cosine_similarity
import plotly.express as px

# Pairwise cosine similarity between the per-label TF-IDF rows.
# With a single argument scikit-learn compares the matrix against itself.
cosine_sim = cosine_similarity(tfidf_matrix)
cosine_sim_df = pd.DataFrame(cosine_sim, index=df_combined.index, columns=df_combined.index)

# Heatmap of the label-by-label similarity matrix. A title and a colour-scale
# label are set so the figure can be read on its own.
fig = px.imshow(
    cosine_sim_df,
    labels={'x': 'Labels', 'y': 'Labels', 'color': 'Cosine similarity'},
    title='TF-IDF cosine similarity between labels',
    aspect='auto'
)
fig.update_layout(width=650, height=600)
fig.show()

In [48]:
# Inspect a few rows of a single label; change `label` to look at another one.
label = "srp"
df.loc[df["Label"].eq(label)].head()

Unnamed: 0,ID,Usage,Text,Label
82,26,Public,Bila sam u Jerusalimu i Rimu u Kelnu i i Santj...,srp
87,140,Public,Dušo nema zbog èega da bude žao.,srp
148,32,Public,Признајем то имам тенденцију да будем али је...,srp
639,114,Public,Има такође и природног гаса и нафте.,srp
644,2,Public,Иако је напустио редовно школовање након прво...,srp


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

# Model on the original (uncleaned) sentences; rows missing text or label are discarded.
df = df.dropna(subset=["Text", "Label"])
X, y = df["Text"], df["Label"]

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Character n-gram TF-IDF (2 to 6 characters, vocabulary capped at 2000 features).
# The vectorizer is fitted on the training split only, then applied to both splits.
tfidf = TfidfVectorizer(analyzer="char", ngram_range=(2, 6), max_features=2000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Logistic regression on the TF-IDF features.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

# Per-label precision / recall / F1 on the held-out split; undefined scores are reported as 0.
y_pred = model.predict(X_test_tfidf)
report = classification_report(y_test, y_pred, zero_division=0)
print(report)

              precision    recall  f1-score   support

         abk       1.00      1.00      1.00        25
         ace       0.94      0.94      0.94        16
         ach       0.94      1.00      0.97        17
         acm       0.31      0.25      0.28        20
         acr       0.95      0.82      0.88        22
         ada       0.00      0.00      0.00         1
         afb       0.33      0.13      0.19        31
         afr       0.77      0.87      0.82        23
         ahk       0.91      1.00      0.95        20
         ajp       0.19      0.33      0.24        15
         aka       0.69      0.86      0.77        21
         aln       0.67      0.76      0.71        21
         als       0.37      0.29      0.33        24
         alt       0.73      0.76      0.74        21
         amh       0.84      0.78      0.81        27
         aoj       0.96      1.00      0.98        23
         apc       0.32      0.36      0.34        22
         ara       0.21    


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

