In [3]:
from kaggle.api.kaggle_api_extended import KaggleApi
from os import environ
import json

# Read the project-local Kaggle credentials and export them as the
# KAGGLE_USERNAME / KAGGLE_KEY environment variables before authenticating.
# JSON is UTF-8 by specification, so the encoding is pinned instead of being
# left to the platform's locale default.
with open('./.kaggle/kaggle.json', encoding='utf-8') as credentials_file:
    kaggle_credentials = json.load(credentials_file)

environ['KAGGLE_USERNAME'] = kaggle_credentials['username']
environ['KAGGLE_KEY'] = kaggle_credentials['key']

api = KaggleApi()
api.authenticate()

# unzip=False leaves the archive zipped under ./data; the loading cell reads
# the CSV straight out of ./data/language-detection.zip.
# NOTE(review): force=True presumably re-downloads on every run -- confirm
# this is wanted, since it makes Restart & Run All depend on the network.
api.dataset_download_files(
    dataset='basilb2s/language-detection',
    path='./data',
    force=True,
    unzip=False,
)


In [4]:
from zipfile import ZipFile
import pandas as pd

# Parse the first member of the downloaded archive as CSV without extracting
# it to disk. Both handles are closed when the `with` block exits.
with ZipFile('./data/language-detection.zip') as archive, \
        archive.open(archive.namelist()[0]) as csv_file:
    df = pd.read_csv(csv_file)

df

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English
...,...,...
10332,ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...,Kannada
10333,ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...,Kannada
10334,ಹೇಗೆ ' ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎ...,Kannada
10335,ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...,Kannada


In [5]:
# Model inputs come from the `Text` column, targets from the `Language` column.
X, Y = df['Text'].values, df['Language'].values

In [6]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import ComplementNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import  cross_validate ,StratifiedKFold

def preprocess(X):
    """Normalise an iterable of strings for vectorisation.

    Each string is lower-cased, stripped of ASCII digits and ASCII
    punctuation, and has every run of whitespace collapsed to a single
    space (leading/trailing whitespace is removed).

    Parameters
    ----------
    X : iterable of str
        The texts to clean.

    Returns
    -------
    list of str
        One cleaned string per input element, in the same order.
    """
    # Imported locally so the function carries its own dependency.
    from string import digits, punctuation

    # Deletion-only translation table: every listed character maps to nothing.
    strip_chars = str.maketrans('', '', digits + punctuation)

    cleaned = []
    for text in X:
        words = text.lower().translate(strip_chars).split()
        cleaned.append(' '.join(words))
    return cleaned

# Bag-of-words features with binary=True; min_df / max_df span the whole
# 0.0-1.0 document-frequency range.
vectorizer = CountVectorizer(binary=True, min_df=0.0, max_df=1.0)

# Complement Naive Bayes classifier with smoothing parameter alpha=1.0.
model = ComplementNB(alpha=1.0)

# Text cleaning -> token counts -> classifier, chained as a single estimator.
pipeline = Pipeline([
    ('preprocessor', FunctionTransformer(preprocess)),
    ('vectorizer', vectorizer),
    ('model', model),
])
# Seed for the shuffled folds. Without it StratifiedKFold(shuffle=True) draws
# a different split on every run, so the reported scores could not be
# reproduced by Restart & Run All.
CV_SEED = 42

# 5-fold stratified cross-validation of the whole pipeline, scored by
# accuracy; train scores are returned as well so over-fitting is visible.
cv_results = cross_validate(
    estimator=pipeline,
    X=X,
    y=Y,
    scoring='accuracy',
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=CV_SEED),
    return_train_score=True,
)


cv_results

{'fit_time': array([0.35288119, 0.37082911, 0.3765595 , 0.34886384, 0.35148597]),
 'score_time': array([0.06306458, 0.06558561, 0.06250024, 0.06249785, 0.06254411]),
 'test_score': array([0.98259188, 0.9787234 , 0.97871311, 0.98597   , 0.97822932]),
 'train_score': array([0.99516266, 0.99552546, 0.99528416, 0.99528416, 0.99588875])}

In [7]:
# Fit the pipeline on the complete dataset (all rows of X / Y).
pipeline.fit(X=X, y=Y)

In [8]:
# Remove the text-cleaning step so the exported pipeline starts at the
# vectorizer; callers clean their input with `preprocess` before `predict`,
# as the post-load smoke test does.
# NOTE(review): presumably this keeps the pickle independent of the
# notebook-local `preprocess` function -- confirm.
#
# The step is selected by name and a new Pipeline is built from the already
# fitted steps, so re-running this cell is a no-op. The previous positional
# `steps.pop(0)` would remove the vectorizer on a second run.
pipeline = Pipeline(
    steps=[(name, step) for name, step in pipeline.steps if name != 'preprocessor']
)
pipeline

In [13]:
from joblib import dump

# Serialise the current `pipeline` object to the API's model directory.
dump(value=pipeline, filename='../api/model/pipeline-0.1.0.pkl')

['../api/model/pipeline-0.1.0.pkl']

In [14]:
from joblib import load 

# Smoke test: reload the saved artefact and classify one sample. The sample
# is cleaned with `preprocess` first, before being passed to `predict`.
model = load('../api/model/pipeline-0.1.0.pkl')
sample = preprocess(['guten tag!'])
model.predict(sample)[0]

'German'

In [3]:
import joblib , pandas , sklearn
# Show the versions of the libraries imported above, in import order.
tuple(lib.__version__ for lib in (joblib, pandas, sklearn))

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import joblib , pandas , sklearn


('1.3.2', '2.2.0', '1.4.0')