In [4]:
import pandas as pd

# Load the layoffs records from the CSV file into a DataFrame.
dataset = pd.read_csv(r"layoffs.csv")

# Preview the first three rows.
print(dataset.head(3))

# DataFrame.info() writes its report (dtypes and non-null counts) directly to
# stdout and returns None. Calling it inside print() therefore emits the report
# *before* the label and then prints a stray "None", so it is called on its own.
print("Get information about data types and missing values")
dataset.info()

     company       location industry  total_laid_off  percentage_laid_off  \
0  Atlassian         Sydney    Other           500.0                 0.05   
1   SiriusXM  New York City    Media           475.0                 0.08   
2     Alerzo         Ibadan   Retail           400.0                  NaN   

       date     stage        country  funds_raised_millions  
0  3/6/2023  Post-IPO      Australia                  210.0  
1  3/6/2023  Post-IPO  United States                  525.0  
2  3/6/2023  Series B        Nigeria                   16.0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2361 entries, 0 to 2360
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   company                2361 non-null   object 
 1   location               2361 non-null   object 
 2   industry               2357 non-null   object 
 3   total_laid_off         1621 non-null   float64
 4   percentage_laid_off 

Handling Missing Values

In [5]:
# Percentage of missing entries in each column, relative to the total row count.
row_count = dataset.shape[0]
null_counts = dataset.isnull().sum()
null_percentages = (null_counts / row_count) * 100
print("Null Values per Column before handling them:\n", null_percentages)
# No column is missing more than 50% of its values, so the gaps can be imputed
# (mean, median, mode or another method).

Null Values per Column before handling them:
 company                   0.000000
location                  0.000000
industry                  0.169420
total_laid_off           31.342651
percentage_laid_off      33.248623
date                      0.042355
stage                     0.254130
country                   0.000000
funds_raised_millions     8.852181
dtype: float64


In [6]:
# Categorical columns: replace missing entries with the most frequent value.
# mode() returns a Series (there can be ties), so [0] picks the first mode.
dataset['industry'] = dataset['industry'].fillna(dataset['industry'].mode()[0])
dataset['stage'] = dataset['stage'].fillna(dataset['stage'].mode()[0])

# Dates: carry the previous row's date forward into missing entries.
# .ffill() replaces fillna(method='ffill'), whose `method` argument is deprecated
# in pandas 2.1+; the result is the same.
# NOTE: forward fill cannot fill a missing value in the very first row.
dataset['date'] = dataset['date'].ffill()

# Numerical columns.
# percentage_laid_off is filled with the mean; the original analysis treats it as
# roughly normally distributed with few outliers (not verified in this cell).
dataset['percentage_laid_off'] = dataset['percentage_laid_off'].fillna(dataset['percentage_laid_off'].mean())
# total_laid_off and funds_raised_millions are filled with the median; the original
# analysis treats them as not normally distributed (not verified in this cell).
dataset['total_laid_off'] = dataset['total_laid_off'].fillna(dataset['total_laid_off'].median())
dataset['funds_raised_millions'] = dataset['funds_raised_millions'].fillna(dataset['funds_raised_millions'].median())

In [7]:
# Recompute the per-column share of missing entries (in percent) after imputation.
remaining_null_counts = dataset.isnull().sum()
remaining_null_percentages = (remaining_null_counts / dataset.shape[0]) * 100
print("Null Values per Column After handling them:\n", remaining_null_percentages)

Null Values per Column After handling them:
 company                  0.0
location                 0.0
industry                 0.0
total_laid_off           0.0
percentage_laid_off      0.0
date                     0.0
stage                    0.0
country                  0.0
funds_raised_millions    0.0
dtype: float64


REPLACING AND HANDLING DATATYPES

In [8]:
# Replace the date column with its pandas datetime representation.
parsed_dates = pd.to_datetime(dataset['date'])
dataset['date'] = parsed_dates

HANDLING DUPLICATES

In [10]:
# Remove rows that are exact duplicates of an earlier row (first occurrence is kept).
dataset = dataset.drop_duplicates()

ENCODING THE DATASET

In [11]:
from sklearn.preprocessing import LabelEncoder

# Label-encode the categorical columns, keeping one fitted encoder per column.
# LabelEncoder.fit_transform refits the encoder on every call, so sharing a single
# instance across columns would keep only the last column's mapping and make the
# integer codes of the other columns impossible to decode afterwards.
categorical_columns = ['company', 'location', 'industry', 'country', 'stage']
label_encoders = {}  # column name -> encoder fitted on that column
for column in categorical_columns:
    label_encoder = LabelEncoder()
    dataset[column] = label_encoder.fit_transform(dataset[column])
    label_encoders[column] = label_encoder
# To recover the original labels of a column:
#   label_encoders[column].inverse_transform(dataset[column])


In [12]:
# Summary statistics and final dimensions of the cleaned data.
print(dataset.describe())
print(dataset.shape)

# Write the cleaned data to disk without the index column. The confirmation is
# printed only after to_csv returns, so it is not shown if the write fails.
dataset.to_csv('cleaned_data.csv', index=False)
print("Cleaned data now saved to a file.")

           company     location     industry  total_laid_off  \
count  2356.000000  2356.000000  2356.000000     2356.000000   
mean    953.183786   113.582343    16.975382      187.936757   
min       0.000000     0.000000     0.000000        3.000000   
25%     474.750000    83.750000    10.000000       50.000000   
50%     956.500000   131.000000    16.000000       80.000000   
75%    1432.250000   148.000000    24.250000      110.000000   
max    1892.000000   190.000000    31.000000    12000.000000   
std     546.992207    52.956013     8.736010      641.849448   

       percentage_laid_off                           date        stage  \
count          2356.000000                           2356  2356.000000   
mean              0.257994  2022-02-02 16:48:29.337860864     6.138370   
min               0.000000            2020-03-11 00:00:00     0.000000   
25%               0.130000            2020-08-19 00:00:00     1.000000   
50%               0.257917            2022-08-01 00:0