# Industry Classification based on Business Description Embeddings

In [2]:
import numpy as np
import pandas as pd
import ast

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
import seaborn as sns

# Loading Data

In [3]:
# Load the labelled frame with the company id, industry name and description embedding.
# NOTE: unpickling can execute arbitrary code; only read pickle files from a trusted source.
df = pd.read_pickle("../data/df_train.pkl")

# Hold out 10% of the rows for validation; the fixed random_state keeps the split reproducible.
# NOTE(review): the split is not stratified although the class counts shown by
# value_counts() further down are very uneven — consider stratify=df["industry"].
df_train, df_val = train_test_split(df, test_size= 0.1, random_state=42)

In [4]:
def _embedding_matrix(embeddings):
    """Convert a Series of embedding vectors into a 2-D float array.

    Parameters
    ----------
    embeddings : iterable
        One embedding per element. String elements (e.g. "[0.01,-0.02,...]")
        are parsed with ``ast.literal_eval``; elements that are already
        sequences are used unchanged, so the cell can be re-run on data that
        has been parsed before.

    Returns
    -------
    numpy.ndarray
        Array with one row per element of ``embeddings``.

    Raises
    ------
    ValueError
        If the vectors do not all have the same length. ``dtype=float`` makes
        NumPy fail loudly instead of building an object array.
    """
    parsed = [ast.literal_eval(e) if isinstance(e, str) else e for e in embeddings]
    return np.array(parsed, dtype=float)


# Same conversion for both splits, so the feature matrices are built identically.
X_train = _embedding_matrix(df_train.business_description_embedding)
X_val = _embedding_matrix(df_val.business_description_embedding)

In [5]:
# Show the first three rows of the training split.
df_train.head(3)

Unnamed: 0,id,industry,business_description_embedding
31138,34170,Financial Services,"[0.019633012,0.009427597,0.006240986,0.0177257..."
15120,16504,Technology Hardware & Equipment,"[0.026561439,-0.049749356,-0.006847293,-0.0088..."
23693,25893,Consumer Durables & Apparel,"[0.0115894,-0.035240766,-0.029952582,0.0226988..."


# Initial Data Analysis

In [6]:
# Summarize the full frame: number of rows, column dtypes, non-null counts and memory usage.
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 31706 entries, 38532 to 15797
Data columns (total 3 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   id                              31706 non-null  int64 
 1   industry                        31706 non-null  object
 2   business_description_embedding  31706 non-null  object
dtypes: int64(1), object(2)
memory usage: 990.8+ KB


In [7]:
# Show the first five rows of the full frame.
df.head()

Unnamed: 0,id,industry,business_description_embedding
38532,42576,Banks,"[0.03411475,-0.03753958,0.023233214,-0.0017205..."
19838,21752,Materials,"[-0.025557408,0.0636078,-0.01350581,-0.0180223..."
904,804,Commercial & Professional Services,"[0.033574868,-0.020379327,-0.03221274,-0.01133..."
28806,31603,Insurance,"[0.020648101,0.017270269,-0.032921147,-0.01411..."
10472,11333,Capital Goods,"[-0.0006651232,0.011097765,-0.01976973,0.02299..."


In [8]:
# Show the last five rows of the full frame.
df.tail()

Unnamed: 0,id,industry,business_description_embedding
6267,6403,Capital Goods,"[0.022082254,-0.015513622,-0.029084805,0.02645..."
11286,12224,Capital Goods,"[-0.031132907,-0.049892895,-0.02100823,0.03896..."
38170,42174,Transportation,"[-0.025557408,0.0636078,-0.01350581,-0.0180223..."
860,754,Technology Hardware & Equipment,"[0.04300068,-0.111498445,-0.0044387826,-0.0004..."
15797,17253,Consumer Services,"[0.067711435,-0.0050259987,-0.016451076,-0.005..."


In [9]:
# Count how many rows belong to each industry, most frequent first (class distribution).
df['industry'].value_counts()

industry
Materials                                         8394
Capital Goods                                     3127
Financial Services                                1936
Pharmaceuticals, Biotechnology & Life Sciences    1925
Technology Hardware & Equipment                   1547
Software & Services                               1537
Energy                                            1326
Consumer Durables & Apparel                       1164
Food, Beverage & Tobacco                          1094
Health Care Equipment & Services                  1014
Banks                                              948
Media & Entertainment                              915
Real Estate Management & Development               906
Commercial & Professional Services                 804
Consumer Services                                  672
Consumer Discretionary Distribution & Retail       658
Automobiles & Components                           625
Semiconductors & Semiconductor Equipment           607
T

In [10]:
# Count missing values per column (the output below shows none).
df.isnull().sum()

id                                0
industry                          0
business_description_embedding    0
dtype: int64

In [11]:
# Count distinct values per column. The output below shows 25 industries, 31705 distinct
# ids for 31706 rows and 27128 distinct embeddings, so some ids and embeddings repeat.
df.nunique()

id                                31705
industry                             25
business_description_embedding    27128
dtype: int64

In [15]:
# to see the financial data present in df_financials_train.pkl
df = pd.read_pickle("../data/df_financials_train.pkl")
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 29561 entries, 0 to 38419
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 29561 non-null  int64  
 1   country_code       29561 non-null  object 
 2   net_profit_margin  28450 non-null  float64
 3   ebitda_margin      10282 non-null  float64
 4   asset_turnover     28198 non-null  float64
dtypes: float64(3), int64(1), object(1)
memory usage: 1.4+ MB


In [16]:
df.head()

Unnamed: 0,id,country_code,net_profit_margin,ebitda_margin,asset_turnover
0,0,CHN,0.303422,,0.025426
1,1,CHN,-0.141921,-0.130371,0.266802
2,2,CHN,-1.333215,,0.317023
3,3,CHN,-0.258595,,0.352064
4,4,CHN,0.183619,,0.963834


# Transforming Data (Label Encoding)

### Tasks:
- Use the scikit-learn label encoder to encode the industry names
- Check if all classes contained in the validation set are also in the training set

#

In [147]:
# Fit the label encoder to the classes (industry names)

In [146]:
# Fit the encoder on the training labels only, so the integer codes are defined
# by the classes the model sees during training.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_train["industry"])

# Check that every class in the validation set is also in the training set.
# transform() would fail on an unseen label anyway; this reports which ones.
unseen_classes = sorted(set(df_val["industry"]) - set(label_encoder.classes_))
if unseen_classes:
    raise ValueError(
        f"Validation set contains industries missing from the training set: {unseen_classes}"
    )

y_val = label_encoder.transform(df_val["industry"])

# Visualize the data

### Tasks:
- Are certain classes over- or under-represented? Either produce a table or a plot to show this.
- Inspect whether there is signal in the business description embeddings:
    - Perform a PCA to project data into 2 dimensions
    - Plot projected data in Scatterplot and color based on classes
    - Provide a description of what you see and judge whether there is signal in the data that allows industry classification

Important: Ensure that your plots have proper axis descriptions and titles. Style the plots so that differences in class distributions are visible (e.g. scatter size, transparency, color, etc.)

### Class distribution

In [125]:
# Plot the class distribution or provide a table that shows how many times each class (industry) appears
# Describe your findings

### PCA - Dimensionality reduction and visualization

In [126]:
# Perform a PCA and plot the projected data. Color the scatter plot based on the classes
# Analyse what you see

# Fitting and comparing Classifier Models

### Tasks:
- Split the data into train and validation data
- Encode the industry labels using LabelEncoder (scikit-learn)
- Fit a LogisticRegression and a kNN-classifier
- Compare the model performance of both models:
    - Compute Accuracy and F1 score
        - Interpret the scores: Explain how they are computed and judge if your model performs well
        - Analyze the classification errors: 
            - Do the errors correlate with how well classes are represented?
            - Which industries does the model identify well and which seem to be similar?
    - Plot a confusion matrix for both models (combine scikit-learn confusion matrix and seaborn heatmap plot)
    - Do both models misclassify the same examples?

Important: Use proper axis labels for the plots!

In [148]:
# your code

# Optional: Confidence Weighted Prediction

## Deliverables:

- Provide a notebook with the implementation and training of an industry classifier model
- The model shall output the industry classification and its confidence as a tuple of vectors $(\hat{y}_{pred}, \hat{y}_{confidence})$
- The confidence score must be between 0 and 1, $\hat{y}_{confidence} [i] \in [0,1]$
- Your model will be evaluated on a private test set
- Designing the confidence score is your task. You may use p-values, a voting mechanism of multiple models, or other techniques
- Another option is to add more features, e.g. financial data, to X

In [139]:
def confidence_weighted_f1(y_true, y_pred, confidence):
    """Binary F1 score in which each prediction counts with its confidence.

    Labels are binary: 1 is the positive class and 0 the negative class.
    Entries with any other label value match none of the terms below, so
    multi-class labels have to be binarized (one class vs. the rest) before
    calling this function.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels (0 or 1).
    y_pred : array-like
        Predicted labels (0 or 1).
    confidence : array-like of float
        Weight of each prediction.

    Returns
    -------
    float
        Harmonic mean of the confidence-weighted precision and recall, or 0
        when precision and recall are both 0 or undefined.
    """
    # Convert up front: with plain Python lists `y_pred == 1` is a single
    # False instead of an element-wise mask, which silently yields a score of 0.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    confidence = np.asarray(confidence, dtype=float)

    # Confidence mass of true positives, false positives and false negatives.
    # True negatives do not enter the F1 score.
    weighted_tp = np.sum(confidence * (y_pred == 1) * (y_true == 1))
    weighted_fp = np.sum(confidence * (y_pred == 1) * (y_true == 0))
    weighted_fn = np.sum(confidence * (y_pred == 0) * (y_true == 1))

    # Guard each ratio against an empty denominator.
    precision = weighted_tp / (weighted_tp + weighted_fp) if (weighted_tp + weighted_fp) > 0 else 0
    recall = weighted_tp / (weighted_tp + weighted_fn) if (weighted_tp + weighted_fn) > 0 else 0

    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    return f1