DATA INGESTION

In [6]:
import pandas as pd

# Specify the path to your CSV file
file_path = "../Data_Science_Challenge_Data - Sheet1.csv"

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

PREPROCESS DATA

In [7]:
import pandas as pd
from gensim.models import Word2Vec
import nltk
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from keras.preprocessing.sequence import pad_sequences


2024-02-04 22:09:58.421235: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
# nltk.download('punkt')
nltk.download("stopwords")
stop_words = set(nltk.corpus.stopwords.words("english"))

# Replace NaN values in 'technologies' column
df.fillna("", inplace=True)
# df['technologies'] = df['technologies'].replace('nan', '')

# Encode the 'type' column
type_map = {"B2B": 0, "B2C": 1, "B2B B2C": 2, "None": 3}
df["type_encoded"] = df["type"].map(type_map)


# Tokenize and preprocess
embedding_columns = [
    "technologies",
    "specialties",
    "company_hubs",
    "industry",
    "categories",
]

# convert size to small medium big to test it

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/filipe.marques/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  df.fillna("", inplace=True)


FEATURE ENGINEERING

In [9]:
def create_embeddings(
    df,
    column_names,
    vector_size=5,
    window=3,
    min_count=1,
    workers=4,
    max_sequence_length=5,
):
    def _preprocess(entry):
        words = [word.strip().lower() for word in entry.split(",")]
        filtered_words = [word for word in words if word not in stop_words]
        return filtered_words

    # Tokenize and preprocess for all specified columns
    for column_name in column_names:
        df[f"tokenized_{column_name}"] = df[column_name].apply(_preprocess)

    # Combine all tokenized columns into a single list of sentences
    all_sentences = df[
        [f"tokenized_{column_name}" for column_name in column_names]
    ].values.flatten()

    # Create Word2Vec model
    model = Word2Vec(
        sentences=all_sentences,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
    )

    for column_name in column_names:
        embeddings = df[f"tokenized_{column_name}"].apply(
            lambda entry: [model.wv[word] for word in entry if word in model.wv]
        )

        # Pad sequences
        padded_embeddings = pad_sequences(
            embeddings,
            maxlen=max_sequence_length,
            dtype="float32",
            padding="post",
            truncating="post",
        )

        # Flatten the embeddings
        flattened_embeddings = pd.DataFrame(
            padded_embeddings.reshape(-1, vector_size * max_sequence_length),
            columns=[
                f"{column_name}_feature_{i+1}"
                for i in range(vector_size * max_sequence_length)
            ],
        )

        # Concatenate the new DataFrame with your original DataFrame
        df = pd.concat([df, flattened_embeddings], axis=1)

    return df, model

In [10]:
df, embedings_model = create_embeddings(df, embedding_columns)

In [11]:
import joblib

# Save Word2Vec model with the current date in the filename
model_filename = f"../saved_models/word2vec_model_latest.pkl"

joblib.dump(embedings_model, model_filename)

['../saved_models/word2vec_model_latest.pkl']

In [12]:
# Define features (X) and target (y)
X = df[[f"{column}_feature_{i+1}" for column in embedding_columns for i in range(25)]]
y = df["type_encoded"]

# Split the data into training and evaluation sets
X_train, X_eval, y_train, y_eval = train_test_split(
    X, y, test_size=0.2, random_state=42
)

ML MODEL TRAIN

In [13]:
from sklearn.svm import SVC

# Initialize the SVM model
svm_model = SVC(kernel="linear", C=1.0)

# Train the SVM model on the training set
svm_model.fit(X_train, y_train)

# Predictions on the evaluation set
y_pred = svm_model.predict(X_eval)

# Evaluate the model
accuracy = accuracy_score(y_eval, y_pred)
classification_rep = classification_report(y_eval, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_rep)

Accuracy: 0.7416666666666667
Classification Report:
              precision    recall  f1-score   support

           0       0.74      1.00      0.85       267
           1       0.00      0.00      0.00        33
           2       0.00      0.00      0.00        60

    accuracy                           0.74       360
   macro avg       0.25      0.33      0.28       360
weighted avg       0.55      0.74      0.63       360



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
from sklearn.svm import SVC
from sklearn.utils import class_weight
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Initialize the SVM model
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
svm_model = SVC(class_weight=dict(enumerate(class_weights)))

# Set up parameter grid for grid search
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10], 'kernel': ['linear', 'rbf', 'poly']}

# Initialize StandardScaler
scaler = StandardScaler()

# Initialize the GridSearchCV
grid_search = GridSearchCV(svm_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Scale the data
X_train_scaled = scaler.fit_transform(X_train)
X_eval_scaled = scaler.transform(X_eval)

# Perform grid search
grid_search.fit(X_train_scaled, y_train)

# Get the best model from grid search
best_svm_model = grid_search.best_estimator_

# Predictions on the evaluation set (use the scaled data)
y_pred = best_svm_model.predict(X_eval_scaled)

# Evaluate the model
accuracy = accuracy_score(y_eval, y_pred)
classification_rep = classification_report(y_eval, y_pred)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_rep)


Best Parameters: {'C': 0.1, 'kernel': 'poly'}
Accuracy: 0.7305555555555555
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.96      0.85       267
           1       0.14      0.03      0.05        33
           2       0.38      0.10      0.16        60

    accuracy                           0.73       360
   macro avg       0.43      0.36      0.35       360
weighted avg       0.64      0.73      0.66       360



In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils import class_weight
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Assuming you have X and y defined
# Split the data into training and evaluation sets
X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize StandardScaler
scaler = StandardScaler()

# Scale the data
X_train_scaled = scaler.fit_transform(X_train)
X_eval_scaled = scaler.transform(X_eval)

# Initialize the Random Forest model with class weights
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
rf_model = RandomForestClassifier(class_weight='balanced', n_estimators=100, random_state=42)

# Set up parameter distributions for randomized search
param_dist = {
    'n_estimators': [50, 100, 150, 200, 250, 300, 350, 400],
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10, 20, 30],
    'min_samples_leaf': [1, 2, 4, 8]
}

# Initialize RandomizedSearchCV
rf_random_search = RandomizedSearchCV(rf_model, param_distributions=param_dist, n_iter=10, cv=5, scoring='accuracy', n_jobs=-1, random_state=42)

# Perform randomized search to find optimal hyperparameters
rf_random_search.fit(X_train_scaled, y_train)

# Get the best Random Forest model from the search
best_rf_model = rf_random_search.best_estimator_

# Predictions on the evaluation set (use the scaled data)
y_pred_rf = best_rf_model.predict(X_eval_scaled)

# Evaluate the Random Forest model
accuracy_rf = accuracy_score(y_eval, y_pred_rf)
classification_rep_rf = classification_report(y_eval, y_pred_rf)

# Print results
print("Random Forest Model:")
print(f"Best Hyperparameters: {rf_random_search.best_params_}")
print(f"Accuracy: {accuracy_rf}")
print("Classification Report:")
print(classification_rep_rf)


Random Forest Model:
Best Hyperparameters: {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': 30}
Accuracy: 0.8333333333333334
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.99      0.90       267
           1       0.60      0.09      0.16        33
           2       1.00      0.53      0.70        60

    accuracy                           0.83       360
   macro avg       0.81      0.54      0.58       360
weighted avg       0.83      0.83      0.80       360



In [16]:
print(y_pred)

[0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0
 0 0 1 0 0 0 2 0 0 0 0 0 0 2 0 0 0 2 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 2 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [17]:
import joblib

# Save the trained SVM model to a file
model_filename = "../saved_models/rf_model_latest.pkl"
joblib.dump(best_rf_model, model_filename)

print(f"Model saved as {model_filename}")

Model saved as ../saved_models/rf_model_latest.pkl
