# Vectorisation de type TF-IDF sur les données textuelles

## Import des librairies et des données

### Import des librairies

In [1]:
import numpy as np
import pandas as pd
import os
# Resolve the project root, build the data path from it and make the root the
# working directory (the project imports below are written relative to it).
#
# The previous expression, os.path.abspath(__name__), treated the module name
# ("__main__" in a notebook) as if it were a file path; its only effect was to
# return the parent of the current working directory. Because this cell also
# calls os.chdir(), re-running it climbed one directory higher every time.
# Staying put when the working directory already contains the `src` package
# makes the cell safe to re-run (and safe to launch from the project root).
# NOTE(review): assumes the notebook's own folder has no `src` sub-directory.
_cwd = os.getcwd()
BASE_DIR = _cwd if os.path.isdir(os.path.join(_cwd, "src")) else os.path.dirname(_cwd)
DATA_DIR = os.path.join(BASE_DIR, 'data')
print(BASE_DIR, DATA_DIR)
os.chdir(BASE_DIR)
from src.features.text.transformers.text_merger import TextMerger
from src.features.text.transformers.extractors import YearExtractor, NumberExtractor, HashtagNumberExtractor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier

c:\Users\mangg\projects\RakutenTeam c:\Users\mangg\projects\RakutenTeam\data


### Import des données

In [None]:
# Load the cleaned training features and labels; in both files the first CSV
# column is used as the index.
features_path = os.path.join(DATA_DIR, "clean/X_train.csv")
labels_path = os.path.join(DATA_DIR, "clean/Y_train.csv")
df = pd.read_csv(features_path, index_col=0)
# Only the "prdtypecode" column of the label file is kept as the target.
target = pd.read_csv(labels_path, index_col=0)["prdtypecode"]

## Feature Engineering

### Text Merging

In [None]:

# Run the project's TextMerger over the frame and store its output as a new
# "full_description" column.
# NOTE(review): presumably this concatenates "designation" and "description";
# confirm against the TextMerger implementation.
merger = TextMerger(
    designation_column="designation",
    description_column="description",
    merged_column="full_description",
)
merged_text = merger.fit_transform(df)
df["full_description"] = merged_text

### TF-IDF Vectorization

In [None]:
# Fit a TF-IDF vectorizer limited to 1000 features on the merged text and wrap
# the dense result in a DataFrame (one column per term, same index as df).
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split performed later in this notebook, so held-out rows influence the
# vocabulary and IDF weights.
vectorizer = TfidfVectorizer(max_features=1000)
tfidf_matrix = vectorizer.fit_transform(df["full_description"])
vectorized_text = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=df.index,
)
vectorized_text.head()

### Extracting 'N°' info

In [None]:
# Apply the project's NumberExtractor to the merged text column.
# NOTE(review): the extracted columns are defined by NumberExtractor itself;
# see src.features.text.transformers.extractors.
number_extractor = NumberExtractor(
    text_column="full_description",
)
numbers = number_extractor.fit_transform(df)
numbers.head()  # preview of the first rows

### Extracting year info

In [None]:
# Apply the project's YearExtractor to the merged text column.
# NOTE(review): the extracted columns are defined by YearExtractor itself;
# see src.features.text.transformers.extractors.
year_extractor = YearExtractor(
    text_column="full_description",
)
years = year_extractor.fit_transform(df)
years.head()  # preview of the first rows


### Extracting Hashtag number info

In [None]:
# Apply the project's HashtagNumberExtractor to the merged text column.
# NOTE(review): the extracted columns are defined by HashtagNumberExtractor
# itself; see src.features.text.transformers.extractors.
hashtags_extractor = HashtagNumberExtractor(
    text_column="full_description",
)
hashtags = hashtags_extractor.fit_transform(df)
hashtags.head()  # preview of the first rows

### Merging and Scaling extracted features

In [None]:
# Place the three extractor outputs side by side (column-wise) and show
# summary statistics rounded to three decimals.
feature_frames = [numbers, years, hashtags]
extracted_features = pd.concat(feature_frames, axis="columns")
extracted_features.describe().round(3)

In [None]:
# Min-max scale the extracted features, then rebuild a DataFrame that keeps
# the original column names and index.
# NOTE(review): the scaler is fitted on every row before the train/test split
# performed later in this notebook, so held-out rows influence the scaling.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(extracted_features)
scaled_features = pd.DataFrame(
    data=scaled_values,
    columns=extracted_features.columns,
    index=extracted_features.index,
)
scaled_features.describe().round(3)

### Features Merging

In [None]:
# Final design matrix: TF-IDF columns followed by the scaled extracted
# features, joined column-wise.
model_inputs = [vectorized_text, scaled_features]
data = pd.concat(model_inputs, axis="columns")
data.head()

## Model Selection

### Separating Training Set and Test Set

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible.
# NOTE(review): the split is not stratified on the target; consider passing
# `stratify` if class frequencies are uneven.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target.values,
    test_size=0.2,
    random_state=42,
)

### Defining models to experiment with

In [None]:
# Candidate estimators. Every estimator that accepts a random_state gets the
# same seed; KNeighborsClassifier is built with its defaults.
_seed = 42
clf_dum = DummyClassifier(random_state=_seed)
clf_knn = KNeighborsClassifier()
clf_lr = LogisticRegression(random_state=_seed)
clf_rf = RandomForestClassifier(random_state=_seed)
clf_svc = SVC(random_state=_seed)

### Defining Param Grids for each classifier

In [None]:
# Hyper-parameter grids explored by the grid search, one dict per classifier.
svc_params = {
    "C": [0.1, 1, 10, 100],
    "gamma": [1, 0.1, 0.01, 0.001],
    "kernel": ["rbf", "poly", "sigmoid"],
}
rf_params = {
    "n_estimators": [10, 100, 1000],
    "max_depth": [None, 5, 10, 20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}
# LogisticRegression's default solver ("lbfgs") does not support the L1
# penalty, so every "l1" candidate failed to fit. "saga" handles both "l1"
# and "l2", which keeps the whole grid valid.
lr_params = {
    "C": [0.1, 1, 10, 100],
    "penalty": ["l1", "l2"],
    "solver": ["saga"],
}
knn_params = {
    "n_neighbors": [3, 5, 11, 19],
    "weights": ["uniform", "distance"],
    "metric": ["euclidean", "manhattan"],
}
dum_params = {"strategy": ["stratified", "most_frequent", "prior", "uniform"]}

### Setting up the classifiers list

In [None]:
# (display name, estimator, parameter grid) triples, in the order in which the
# grid search will process them.
_names = ["Dummy", "SVC", "Random Forest", "Logistic Regression", "KNN"]
_estimators = [clf_dum, clf_svc, clf_rf, clf_lr, clf_knn]
_grids = [dum_params, svc_params, rf_params, lr_params, knn_params]
classifiers = list(zip(_names, _estimators, _grids))

### Setting up the results table

In [None]:
# Empty summary table with one row per model, plus two lists that collect the
# tuned estimators and their parameters.
model_names = ["Dummy", "SVC", "Random Forest", "Logistic Regression", "KNN"]
results = pd.DataFrame(index=model_names, columns=["Accuracy", "Best params"])
best_models, best_params = [], []

### Exploring Grid Search CV for different models

In [None]:
# Tune every candidate with a 5-fold grid search on the training set, score
# the tuned model on the held-out test set, and record the outcome.
for name, clf, params in classifiers:
    print(f"Training {name}...")
    grid = GridSearchCV(clf, params, cv=5, n_jobs=3, verbose=1, scoring="accuracy")
    grid.fit(X_train, y_train)
    print(f"Meilleurs paramètres pour {name}: {grid.best_params_}")
    print(f"Meilleur score pour {name}: {grid.best_score_:.3f}")
    # Score on the held-out test set, using the same "accuracy" scorer.
    test_score = grid.score(X_test, y_test)
    print(f"Score sur le test set pour {name}: {test_score:.3f}")
    results.loc[name, "Accuracy"] = test_score
    # The "Best params" column was declared but never filled; .at stores the
    # dict as a single cell value.
    results.at[name, "Best params"] = grid.best_params_
    best_models.append({name: grid.best_estimator_})
    best_params.append({name: grid.best_params_})

### Display Results

In [None]:
# Display the per-model summary table populated by the grid-search loop.
results

### Analysing Results of best estimator