In [25]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

In [26]:
# Load the raw credit-score dataset; the relative path is resolved against
# the notebook's current working directory.
data = pd.read_csv('Credit_score_data.csv')
# Print (rows, columns) so the effect of the later cleaning steps can be compared.
print(data.shape)
# Last expression of the cell: rendered as the cell output (first rows of the frame).
data.head()

(50000, 29)


Unnamed: 0,industry,credit_score,year,turnover,year_creation,total_actif,id,total_asset,net_total_asset,margin,...,stable_debt,financial_debt,tot_debt,investment,ppe,intangible,ppent,sales,capital,nbemp
0,47,3,2016,1.793,2006.0,1951,1479243776825,2120.0,1951.0,0.588608,...,335.0,336.0,560.0,7.0,217.0,0,380.0,1793.0,1588.0,4.0
1,68,6,2012,0.037,2003.0,365,1353691776396,530.0,365.0,0.0,...,0.0,592.0,603.0,0.0,508.0,0,673.0,37.0,508.0,0.0
2,47,5+,2016,1.899,2004.0,1730,1359587200964,1805.0,1730.0,0.0,...,180.0,206.0,423.0,0.0,76.0,0,151.0,1899.0,1373.0,0.0
3,55,4,2004,1.085,2000.0,1945,1296533120972,2559.0,1945.0,0.0,...,888.0,1235.0,1499.0,13.0,1587.0,0,2061.0,1085.0,2018.0,3.0
4,59,5+,2012,1.35,1996.0,2372,1229382656899,3020.0,2372.0,0.25,...,57.0,320.0,915.0,26.0,113.0,0,197.0,1350.0,2093.0,8.0


In [28]:
# Drop every row that contains at least one missing value (dropna() defaults)
# and rebind `data` to the filtered frame.
data = data.dropna()
# Print the shape again so the number of removed rows is visible.
print('After removing null values',data.shape)

After removing null values (49846, 29)


### Dataset Variable Descriptions

- **industry**:  
  A numeric code representing the industry classification or sector to which the company belongs. This identifier helps group companies by their business domain.

- **credit_score**:  
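  The credit rating assigned to the company, stored as a text label (e.g., `3++`, `5+`, `P`) rather than a number; the preprocessing code below converts these labels to numeric values. This score may be used to assess creditworthiness or risk level.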

- **year**:  
  The calendar year in which the credit score was recorded. This helps track changes in credit performance over time.

- **turnover**:  
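  The turnover value of the company, i.e., its total revenue or sales volume, reported in scaled units: in the sample rows shown above, `turnover` equals `sales` divided by 1,000 (e.g., `sales` = 1793.0 and `turnover` = 1.793).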

- **year_creation**:  
  The year when the company was founded or established. This provides context regarding the company’s operational history.

- **total_actif**:  
  The total assets of the company ("actif" is the French accounting term for assets), indicating the total resources available (may be reported in thousands or millions). In the sample rows shown above its value matches `net_total_asset`.

- **id**:  
  A unique identifier for the company within the dataset. This can be used for indexing or merging with other datasets.

- **total_asset**:  
  The total asset value of the company, including both tangible and intangible assets, expressed in monetary terms.

- **net_total_asset**:  
  The net total assets after any deductions (such as depreciation), reflecting the company’s effective asset base.

- **margin**:  
  The profit margin or similar financial ratio, likely expressed as a decimal. This can indicate the company's profitability relative to its revenue.

- **wcr**:  
  Working Capital Requirement, which measures the short-term liquidity needs of the company. Negative values may indicate a net source of working capital.

- **sfc**:  
  Likely the company’s self-financing capacity (SFC), which indicates the internal funds available for investment or operations.

- **bank_debt**:  
  The amount of debt owed to banks, representing financial obligations that may be short- or long-term.

- **bank_debt_lt**:  
  The portion of bank debt classified as long-term. This helps differentiate between short-term and long-term liabilities.

- **other_debt**:  
  Debt obtained from sources other than banks. A value of 0 suggests no additional external borrowing beyond bank loans.

- **tot_debt1**:  
  Total debt calculated using one method or aggregation, summing up various debt components.

- **tot_debt2**:  
  An alternative calculation of total debt, possibly using different accounting treatments or components.

- **equity1**:  
  The company’s shareholders’ equity computed by one method. This represents the net assets available to shareholders.

- **equity2**:  
  An alternative measure of shareholders’ equity, which may include different adjustments or exclusions.

- **stable_debt**:  
  The portion of debt considered stable (often long-term or less volatile), indicating reliable financing for long-term investments.

- **financial_debt**:  
  The overall financial debt of the company, encompassing both bank and other forms of financing.

- **tot_debt**:  
  The aggregate total debt of the company. This value summarizes all forms of debt, providing a snapshot of overall leverage.

- **investment**:  
  The amount invested in the company’s operations or capital expenditures. This may include investments in fixed assets or expansion initiatives.

- **ppe**:  
  Property, Plant, and Equipment (PPE) refers to the tangible, long-term assets used in the company’s operations.

- **intangible**:  
  The value of intangible assets such as patents, trademarks, or goodwill. A value of 0 indicates no recorded intangible assets.

- **ppent**:  
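  Net PPE (Property, Plant, and Equipment after depreciation), providing insight into the current value of the company’s physical assets. Note that in the sample rows shown above `ppent` is larger than `ppe`, which is unexpected for a net-of-depreciation figure, so the definitions of `ppe` and `ppent` should be verified against the data source.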

- **sales**:  
  The total sales revenue generated by the company, which is a key indicator of market performance and business activity.

- **capital**:  
  The capital invested in the company, which may include both equity capital and retained earnings reinvested into the business.

- **nbemp**:  
  The number of employees working for the company. This can be used as a proxy for the company’s size and operational scale.


In [29]:
# Remove the company identifier column from the working frame.
# errors='ignore' keeps this statement re-runnable: without it, executing it a
# second time raises KeyError because 'id' has already been dropped from `data`.
data = data.drop(columns=['id'], errors='ignore')

# Numeric value for each credit-score label. In this table a trailing '+'
# adds 0.4 to the base grade and '++' adds 0.8; 'P' maps to 0.0.
# Built once at definition time instead of on every call, because the function
# is applied to every row of the column.
_CREDIT_SCORE_VALUES = {
    '3++': 3.8, '3+': 3.4, '3': 3.0,
    '4+': 4.4, '4': 4.0,
    '5+': 5.4, '5': 5.0,
    '6': 6.0, '7': 7.0, '8': 8.0, '9': 9.0,
    'P': 0.0,
}


def set_creditscore(value):
    """Convert a credit-score label to its numeric value.

    Args:
        value: One of the text labels listed in ``_CREDIT_SCORE_VALUES``
            (for example ``'5+'`` or ``'P'``).

    Returns:
        float: The numeric value associated with the label.

    Raises:
        KeyError: If ``value`` is not a known label. This also covers values
            that were already converted to numbers.
    """
    try:
        return _CREDIT_SCORE_VALUES[value]
    except KeyError:
        # Re-raise the same exception type, but say which label was rejected.
        raise KeyError(f"Unknown credit_score label: {value!r}") from None
# Replace each credit-score label in the column with its numeric value,
# converting element by element through set_creditscore.
data['credit_score'] = data['credit_score'].map(set_creditscore)
    

In [30]:
def encode_variable(df, column):
    """Replace the values of ``column`` with their rank among the distinct values.

    The distinct values of the column are sorted in ascending order and every
    value is replaced by its zero-based position in that ordering, so the
    smallest value becomes 0, the next one 1, and so on.

    Args:
        df: DataFrame containing ``column``. It is not modified.
        column: Label of the column to encode.

    Returns:
        A copy of ``df`` in which ``column`` holds the encoded positions; all
        other columns and the column order are unchanged.

    Note:
        Missing values are not handled specially; the column is expected to
        contain only mutually comparable, non-null values.
    """
    distinct_values = sorted(df[column].unique())
    # value -> position in the sorted list of distinct values
    codes = {value: code for code, value in enumerate(distinct_values)}
    # Work on a copy so the caller's DataFrame is not mutated as a side effect.
    encoded = df.copy()
    encoded[column] = df[column].map(codes)
    return encoded

# Replace the `year` and `industry` values with integer codes, one column
# after the other (same order as before: year first, then industry).
for column_name in ('year', 'industry'):
    data = encode_variable(data, column_name)

In [32]:
# Shuffle all rows reproducibly, then cut the frame into a training part (the
# first TRAIN_ROWS shuffled rows) and a test part (everything after them).
SPLIT_SEED = 23      # fixed seed so the shuffle is the same on every fresh run
TRAIN_ROWS = 40000   # number of shuffled rows written to the training file

# NOTE: `data` is rebound to the shuffled frame, so executing this cell again
# without re-running the cells above reshuffles already-shuffled rows and
# produces a different split. Run it once per fresh kernel.
data = data.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)

# Fail loudly instead of silently writing an empty test file.
assert len(data) > TRAIN_ROWS, (
    f'need more than {TRAIN_ROWS} rows to leave a test set, got {len(data)}'
)
data_train, data_test = data.iloc[:TRAIN_ROWS], data.iloc[TRAIN_ROWS:]

# index=False: the positional index carries no information after the shuffle.
data_train.to_csv('cleaned_data_train.csv', index=False)
data_test.to_csv('cleaned_data_test.csv', index=False)