<a href="https://colab.research.google.com/github/JReal10/project_moon/blob/main/Analyze.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Predicting Unicorn Success: Analyzing Factors Influencing Valuation Over $5 Billion**
------------
### A Data-Driven Approach to Understanding Start-Up Growth
----------
- Jamie Ogunidran
- 25/10/2024

## **Purpose**
------------
- Content to Include:
- Brief overview of the project.
- Key methodologies used.
- Summary of major findings.
- Main conclusions and recommendations.

## Key Findings
---

## Conclusion
---

## **Methodology**  
------------
### 1. Data Preparation & Collection
Cleaned and preprocessed data to ensure accuracy and consistency for analysis.

### 2. Exploratory Data Analysis
Explored key variables and relationships to uncover patterns and guide further analysis.

### 3. Data Visualization
Created visuals to illustrate trends, with annotations for notable outliers and patterns.


### 4. Predictive Modelling
Built and evaluated models to predict company valuations over $5 billion, using selected features and performance metrics.

## Data Collection & Preparation
------------
Source data: [link to source data](https://www.cbinsights.com/research-unicorn-companies)

In [None]:
# Import necessary libraries
import polars as pl # DataFrame library
import numpy as np
import os

In [None]:
def load_data(file_path) -> pl.DataFrame:
  """
  Read the CSV located at ``file_path`` and return it as a Polars DataFrame.

  Raises:
    FileNotFoundError: if nothing exists at ``file_path``.
  """
  # Happy path first: hand the file straight to Polars when it is present
  if os.path.exists(file_path):
    return pl.read_csv(file_path)

  raise FileNotFoundError(f"File not found at {file_path}")

In [None]:
def clean_data(data: pl.DataFrame) -> pl.DataFrame:
    """
    Prepare the raw unicorn table for analysis.

    Steps, in order:
      1. Drop every row that contains a null.
      2. Strip 'Valuation ($B)' down to digits and periods, then cast to Float32.
      3. Parse 'Date Joined' (expected as DD/MM/YYYY) into a Date.
      4. Add 'Year Joined' and 'Month Joined', derived from the parsed date.
    """
    valuation_as_float = (
        pl.col('Valuation ($B)')
        .str.replace_all(r'\$', '')        # dollar sign
        .str.replace_all(r'[^\d\.]', '')   # anything that is not a digit or a period
        .cast(pl.Float32)
    )
    date_joined_parsed = pl.col('Date Joined').str.strptime(pl.Date, format="%d/%m/%Y")
    joined_on = pl.col("Date Joined")

    return (
        data.drop_nulls()
        .with_columns([valuation_as_float, date_joined_parsed])
        # Separate step on purpose: year/month must read the already-parsed Date column
        .with_columns([
            joined_on.dt.year().alias("Year Joined"),
            joined_on.dt.month().alias("Month Joined"),
        ])
    )

In [None]:
def better_column_format(data):
  """
  Normalise the column names: lower-case every name and swap each space for an underscore.

  Returns the result of ``data.rename`` applied with the old-name -> new-name mapping.
  """
  normalised_names = {
      original: original.lower().replace(" ", "_")
      for original in data.columns
  }
  return data.rename(normalised_names)

In [None]:
def save_data(data, name, output_dir='/content/transformed_data'):
    """
    Save processed data to ``<output_dir>/<name>.csv``.

    Args:
        data: Frame-like object exposing ``write_csv(path)`` (e.g. a Polars DataFrame).
        name: File name without the ``.csv`` extension.
        output_dir: Destination directory. Defaults to the previously hard-coded
            location, so existing two-argument calls behave as before.
    """
    # Create the destination first: the write fails if the directory does not exist yet
    os.makedirs(output_dir, exist_ok=True)
    # Join the directory and file name into a valid path
    data_path = os.path.join(output_dir, f'{name}.csv')
    # Save the dataframe as a CSV file
    data.write_csv(data_path)

In [None]:
# Data-preparation pipeline: load the raw export, clean it, then normalise the column names
file_path = "/content/CB-Insights_Global-Unicorn-Club_2024.csv"
df = load_data(file_path)
cleaned_df = better_column_format(clean_data(df))

# NOTE(review): the save step is disabled, yet a later cell reads '/content/cleaned_data.csv' —
# confirm how that file is produced on a fresh run
#save_data(cleaned_df, 'cleaned_data')

In [None]:
# Preview the first rows of the cleaned, renamed frame
cleaned_df.head()

company,valuation_($b),date_joined,country,city,industry,select_investors,year_joined,month_joined
str,f32,date,str,str,str,str,i32,i8
"""ByteDance""",225.0,2017-04-07,"""China""","""Beijing""","""Media & Entertainment""","""Sequoia Capital China, SIG Asi…",2017,4
"""SpaceX""",200.0,2012-12-01,"""United States""","""Hawthorne""","""Industrials""","""Founders Fund, Draper Fisher J…",2012,12
"""OpenAI""",80.0,2019-07-22,"""United States""","""San Francisco""","""Enterprise Tech""","""Khosla Ventures""",2019,7
"""Stripe""",70.0,2014-01-23,"""United States""","""San Francisco""","""Financial Services""","""Khosla Ventures, LowercaseCapi…",2014,1
"""Revolut""",45.0,2018-04-26,"""United Kingdom""","""London""","""Financial Services""","""Index Ventures, DST Global, Ri…",2018,4


## Exploratory Data Analysis
-----

In [None]:
# Set-up
%load_ext sql
%sql sqlite://
import pandas as pd

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [None]:
with open('cleaned_data.csv') as f: unicorn_table = pd.read_csv('/content/cleaned_data.csv', encoding='unicode_escape', index_col=0)
%sql drop table if exists unicorn_table;
%sql --persist unicorn_table

 * sqlite://
Done.
 * sqlite://


'Persisted unicorn_table'

In [None]:
# Rename "valuation_($b)" to valuation_b so the queries below can reference it without quoting
%sql ALTER TABLE unicorn_table RENAME COLUMN "valuation_($b)" TO valuation_b;

 * sqlite://
Done.


[]

### General Analysis
- Feature engineering: add a new `continent` category to the table.
- The new category takes one of 7 values: Asia, Africa, North America, South America, Europe, Oceania and Other.


In [None]:
%%sql
DROP TABLE IF EXISTS unicorn_table_f;
-- Feature table: every unicorn_table column plus
--   investor_count: number of comma-separated entries in select_investors (commas + 1)
--   continent:      looked up from country; any country not listed below becomes 'Other'
-- valuation_b is no longer selected a second time next to `*` (that produced a duplicate
-- column), and the DROP above makes the cell safe to re-run.
CREATE TABLE unicorn_table_f AS
SELECT
    *,
    (LENGTH(select_investors) - LENGTH(REPLACE(select_investors, ',', '')) + 1) AS investor_count,
    CASE
        -- Africa
        WHEN country IN ('Algeria', 'Angola', 'Benin', 'Botswana', 'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cameroon', 'Central African Republic', 'Chad', 'Comoros', 'Congo, Democratic Republic of the', 'Congo, Republic of the', 'Djibouti', 'Egypt', 'Equatorial Guinea', 'Eritrea', 'Eswatini', 'Ethiopia', 'Gabon', 'Gambia', 'Ghana', 'Guinea', 'Guinea-Bissau', 'Ivory Coast', 'Kenya', 'Lesotho', 'Liberia', 'Libya', 'Madagascar', 'Malawi', 'Mali', 'Mauritania', 'Mauritius', 'Morocco', 'Mozambique', 'Namibia', 'Niger', 'Nigeria', 'Rwanda', 'Sao Tome and Principe', 'Senegal', 'Seychelles', 'Sierra Leone', 'Somalia', 'South Africa', 'South Sudan', 'Sudan', 'Tanzania', 'Togo', 'Tunisia', 'Uganda', 'Zambia', 'Zimbabwe') THEN 'Africa'
        -- Asia
        WHEN country IN ('Afghanistan', 'Armenia', 'Azerbaijan', 'Bahrain', 'Bangladesh', 'Bhutan', 'Brunei', 'Cambodia', 'China', 'Cyprus', 'Georgia', 'India', 'Indonesia', 'Iran', 'Iraq', 'Israel', 'Japan', 'Jordan', 'Kazakhstan', 'Kuwait', 'Kyrgyzstan', 'Laos', 'Lebanon', 'Malaysia', 'Maldives', 'Mongolia', 'Myanmar', 'Nepal', 'North Korea', 'Oman', 'Pakistan', 'Palestine', 'Philippines', 'Qatar', 'Saudi Arabia', 'Singapore', 'South Korea', 'Sri Lanka', 'Syria', 'Tajikistan', 'Thailand', 'Timor-Leste', 'Turkey', 'Turkmenistan', 'United Arab Emirates', 'Uzbekistan', 'Vietnam', 'Yemen') THEN 'Asia'
        -- Europe
        WHEN country IN ('Albania', 'Andorra', 'Austria', 'Belarus', 'Belgium', 'Bosnia and Herzegovina', 'Bulgaria', 'Croatia', 'Czech Republic', 'Denmark', 'Estonia', 'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Iceland', 'Ireland', 'Italy', 'Kosovo', 'Latvia', 'Liechtenstein', 'Lithuania', 'Luxembourg', 'Malta', 'Moldova', 'Monaco', 'Montenegro', 'Netherlands', 'North Macedonia', 'Norway', 'Poland', 'Portugal', 'Romania', 'Russia', 'San Marino', 'Serbia', 'Slovakia', 'Slovenia', 'Spain', 'Sweden', 'Switzerland', 'Ukraine', 'United Kingdom', 'Vatican City') THEN 'Europe'
        -- North America
        WHEN country IN ('Antigua and Barbuda', 'Bahamas', 'Barbados', 'Belize', 'Canada', 'Costa Rica', 'Cuba', 'Dominica', 'Dominican Republic', 'El Salvador', 'Grenada', 'Guatemala', 'Haiti', 'Honduras', 'Jamaica', 'Mexico', 'Nicaragua', 'Panama', 'Saint Kitts and Nevis', 'Saint Lucia', 'Saint Vincent and the Grenadines', 'Trinidad and Tobago', 'United States') THEN 'North America'
        -- Oceania
        WHEN country IN ('Australia', 'Fiji', 'Kiribati', 'Marshall Islands', 'Micronesia', 'Nauru', 'New Zealand', 'Palau', 'Papua New Guinea', 'Samoa', 'Solomon Islands', 'Tonga', 'Tuvalu', 'Vanuatu') THEN 'Oceania'
        -- South America
        WHEN country IN ('Argentina', 'Bolivia', 'Brazil', 'Chile', 'Colombia', 'Ecuador', 'Guyana', 'Paraguay', 'Peru', 'Suriname', 'Uruguay', 'Venezuela') THEN 'South America'
        ELSE 'Other'
    END AS continent
FROM
    unicorn_table
ORDER BY
    continent, country;


 * sqlite://
Done.


[]

In [None]:
%sql SELECT MAX(valuation_b) AS industry_count FROM unicorn_table;

 * sqlite://
Done.


industry_count
225.0


# Valuation Analysis

In [None]:
%%sql
DROP TABLE IF EXISTS valuation_unicorn_table;
-- Top 10 (country, city) pairs by total valuation.
-- Reads from unicorn_table, the table persisted earlier in this notebook; the previous
-- source name, unicorn_company, is never created here.
-- percentage_of_total, total_company_count and avg_valuation_b use whole-table scalar
-- subqueries, so the last two repeat the same value on every row.
CREATE TABLE valuation_unicorn_table AS
SELECT
    country,
    city,
    ROUND(SUM(valuation_b), 2) AS total_valuation,
    ROUND(MAX(valuation_b), 2) AS max_valuation,
    ROUND(MIN(valuation_b), 2) AS min_valuation,
    ROUND(AVG(valuation_b), 2) AS avg_valuation,
    COUNT(*) AS company_count,
    ROUND(SUM(valuation_b) / (SELECT SUM(valuation_b) FROM unicorn_table) * 100.00, 3) AS percentage_of_total,
    (SELECT COUNT(DISTINCT company) FROM unicorn_table) AS total_company_count,
    (SELECT ROUND(AVG(valuation_b), 2) FROM unicorn_table) AS avg_valuation_b
FROM unicorn_table
GROUP BY country, city
ORDER BY total_valuation DESC
LIMIT 10;

-- Display the stored summary (CREATE TABLE on its own returns no rows)
SELECT * FROM valuation_unicorn_table ORDER BY total_valuation DESC;




 * sqlite://
Done.


country,city,total_valuation,max_valuation,min_valuation,avg_valuation,company_count,percentage_of_total,total_company_count,avg_valuation_b
United States,San Francisco,798.24,80.0,1.0,4.48,178,20.625,1220.0,3.0
China,Beijing,391.55,225.0,1.0,6.32,62,10.117,1220.0,3.0
United States,New York,284.91,13.3,1.0,2.39,119,7.362,1220.0,3.0
United States,Hawthorne,201.35,200.0,1.35,100.68,2,5.203,1220.0,3.0
United Kingdom,London,169.21,45.0,1.0,3.94,43,4.372,1220.0,3.0
India,Bengaluru,86.81,11.5,1.0,2.8,31,2.243,1220.0,3.0
China,Shanghai,81.98,17.0,1.0,2.28,36,2.118,1220.0,3.0
France,Paris,61.38,6.4,1.0,2.79,22,1.586,1220.0,3.0
United States,Palo Alto,60.62,9.2,1.0,2.89,21,1.566,1220.0,3.0
United States,Boston,58.0,7.4,1.0,2.9,20,1.499,1220.0,3.0


In [None]:
%%sql
DROP TABLE IF EXISTS valuation_unicorn_continent_table;
-- Valuation summary per continent.
-- Reads from unicorn_table_f, the table that carries the continent column; the previous
-- source name, unicorn_table_continent, is never created in this notebook.
-- percentage_of_total, total_company_count and avg_valuation_b use whole-table scalar
-- subqueries, so the last two repeat the same value on every row.
CREATE TABLE valuation_unicorn_continent_table AS
SELECT
    continent,
    ROUND(SUM(valuation_b), 2) AS total_valuation,
    ROUND(MAX(valuation_b), 2) AS max_valuation,
    ROUND(MIN(valuation_b), 2) AS min_valuation,
    ROUND(AVG(valuation_b), 2) AS avg_valuation,
    COUNT(*) AS company_count,
    ROUND(SUM(valuation_b) / (SELECT SUM(valuation_b) FROM unicorn_table_f) * 100.00, 3) AS percentage_of_total,
    (SELECT COUNT(DISTINCT company) FROM unicorn_table_f) AS total_company_count,
    (SELECT ROUND(AVG(valuation_b), 2) FROM unicorn_table_f) AS avg_valuation_b
FROM unicorn_table_f
GROUP BY continent
ORDER BY total_valuation DESC;

-- Display the stored summary (CREATE TABLE on its own returns no rows)
SELECT * FROM valuation_unicorn_continent_table ORDER BY total_valuation DESC;


 * sqlite://
Done.


continent,total_valuation,max_valuation,min_valuation,avg_valuation,company_count,percentage_of_total,total_company_count,avg_valuation_b
North America,2315.4,200.0,1.0,3.31,699,59.826,1220.0,3.0
Asia,954.82,225.0,1.0,3.19,299,24.671,1220.0,3.0
Europe,482.87,45.0,1.0,2.81,172,12.477,1220.0,3.0
South America,49.44,5.25,1.0,2.06,24,1.277,1220.0,3.0
Oceania,42.24,25.4,1.0,4.69,9,1.091,1220.0,3.0
Africa,16.29,10.0,1.0,3.26,5,0.421,1220.0,3.0
Other,9.15,2.0,1.0,1.53,6,0.236,1220.0,3.0


#### Key Findings
- North America has a total valuation of 2,315.4 billion USD, which is 59.8% of the overall total.
- Oceania
- The city with the highest total valuation is San Francisco in the United States, at 798.24 billion USD, followed by Beijing in China at 391.55 billion USD.
- The average valuation across all unicorn companies is 3 billion USD.
- Oceania has the highest average company valuation of any continent.

# Industry Analysis

In [None]:
%%sql
-- Per-industry valuation summary: total, max, min, mean, and share of the overall valuation
SELECT
    industry,
    ROUND(SUM(valuation_b), 2) AS total_valuation,
    ROUND(MAX(valuation_b), 2) AS max_valuation,
    ROUND(MIN(valuation_b), 2) AS min_valuation,
    ROUND(AVG(valuation_b), 2) AS avg_valuation,
    ROUND(SUM(valuation_b / (SELECT SUM(valuation_b) FROM unicorn_table) * 100.00), 3) AS percentage_of_total
FROM unicorn_table
GROUP BY industry;

 * sqlite://
Done.


industry,total_valuation,max_valuation,min_valuation,avg_valuation,percentage_of_total
Consumer & Retail,529.47,31.0,1.0,2.61,13.681
Enterprise Tech,1191.76,80.0,1.0,3.03,30.793
Financial Services,715.4,70.0,1.0,3.37,18.485
Health,1.0,1.0,1.0,1.0,0.026
Healthcare & Life Sciences,260.9,12.9,1.0,2.27,6.741
Industrials,639.74,200.0,1.0,3.53,16.53
Insurance,49.93,5.0,1.0,2.08,1.29
Media & Entertainment,482.01,225.0,1.0,5.67,12.454


In [None]:
%%sql
-- Time series of industry growth.
-- (%%sql has to be the first line of the cell; the title used to sit above it as a Python
-- comment, which stops the cell magic from being recognised.)
-- summary: companies and total valuation per industry and joining year.
WITH summary AS (
    SELECT
        industry,
        strftime('%Y', date_joined) AS year,
        COUNT(company) AS company_count,
        ROUND(SUM(valuation_b), 2) AS total_valuation_b
    FROM unicorn_table
    GROUP BY industry, year
    ORDER BY industry, year
)
-- growth_rate compares each row with the previous row of the same industry. Years with no
-- new unicorns have no row, so the comparison can span several calendar years.
-- The first row of each industry has nothing to compare against and is reported as '0%'.
SELECT
    industry,
    year,
    total_valuation_b,
    CASE
        WHEN LAG(total_valuation_b) OVER(PARTITION BY industry ORDER BY year) IS NULL THEN '0%'
        ELSE ROUND(((total_valuation_b / LAG(total_valuation_b) OVER(PARTITION BY industry ORDER BY year)) - 1) * 100, 2) || '%'
    END AS growth_rate
FROM summary
ORDER BY industry, year;



 * sqlite://
Done.


industry,year,total_valuation_b,growth_rate
Consumer & Retail,2007,1.38,0%
Consumer & Retail,2012,33.0,2291.3%
Consumer & Retail,2014,11.4,-65.45%
Consumer & Retail,2015,21.97,92.72%
Consumer & Retail,2016,29.88,36.0%
Consumer & Retail,2017,49.83,66.77%
Consumer & Retail,2018,57.35,15.09%
Consumer & Retail,2019,60.1,4.8%
Consumer & Retail,2020,76.49,27.27%
Consumer & Retail,2021,121.09,58.31%


#### Key Findings
- Enterprise Tech holds about 30% of total unicorn valuation, followed by Financial Services with 18% and Industrials with 16.5%.
- Enterprise Tech has a total valuation of 1,191.76 billion USD.
- Media & Entertainment has the highest average valuation, at 5.67 billion USD.
- Enterprise Tech shows the largest growth, at 2335.6%, rising from a total valuation of 2.5 billion USD to 60.89 billion USD.
- Consumer & Retail shows the second-largest growth, at 512%, rising from a valuation of 1.38 billion USD to 8.45 billion USD.


# Investor Analysis

Correlation between Company's valuation and the number of investors.

In [None]:
%%sql
-- Pearson correlation between valuation_b and investor_count, computed from raw sums.
-- investor_count = number of commas in select_investors + 1.
WITH temp AS (SELECT
    company,
    select_investors,
    valuation_b,
    (LENGTH(select_investors) - LENGTH(REPLACE(select_investors, ',', '')) + 1) AS investor_count
FROM
    unicorn_table)

-- r = (n*Sxy - Sx*Sy) / (sqrt(n*Sxx - Sx^2) * sqrt(n*Syy - Sy^2))
-- with x = valuation_b, y = investor_count and n = COUNT(*)
SELECT
    (COUNT(*) * SUM(valuation_b * investor_count) - SUM(valuation_b) * SUM(investor_count)) /
    (SQRT(COUNT(*) * SUM(POWER(valuation_b, 2)) - POWER(SUM(valuation_b), 2)) *
     SQRT(COUNT(*) * SUM(POWER(investor_count, 2)) - POWER(SUM(investor_count), 2)))
    AS correlation_coefficient
FROM temp;


 * sqlite://
Done.


correlation_coefficient
0.0402456934628118


In [None]:
%%sql
DROP TABLE IF EXISTS unicorn_investors;
-- One row per (company, investor): splits the comma-separated select_investors list.
-- Changes from the previous version:
--   * SUBSTR / INSTR replace SUBSTRING(... FROM ... FOR ...) and POSITION(... IN ...),
--     which SQLite does not support;
--   * the source is unicorn_table (unicorn_company is never created in this notebook);
--   * select_investors is carried through to the output, since a later query references it;
--   * blank investor names (e.g. from a trailing comma) are filtered out.
CREATE TABLE unicorn_investors AS
WITH RECURSIVE initial_table AS (
    -- Assign row numbers as unique identifiers for each row in unicorn_table
    SELECT
        ROW_NUMBER() OVER () AS row_num,
        company,
        valuation_b,
        date_joined,
        country,
        city,
        industry,
        select_investors,
        year_joined,
        month_joined
    FROM unicorn_table
),
split_investors AS (
    -- Anchor member: take the text before the first comma as the investor and keep the
    -- remainder in `rest`. Appending ',' guarantees INSTR finds a comma even for the last name.
    SELECT
        row_num,
        company,
        valuation_b,
        date_joined,
        country,
        city,
        industry,
        select_investors,
        year_joined,
        month_joined,
        TRIM(SUBSTR(select_investors, 1, INSTR(select_investors || ',', ',') - 1)) AS investor,
        SUBSTR(select_investors, INSTR(select_investors || ',', ',') + 1) AS rest
    FROM initial_table

    UNION ALL

    -- Recursive member: repeat the split on `rest` until nothing is left
    SELECT
        row_num,
        company,
        valuation_b,
        date_joined,
        country,
        city,
        industry,
        select_investors,
        year_joined,
        month_joined,
        TRIM(SUBSTR(rest, 1, INSTR(rest || ',', ',') - 1)) AS investor,
        SUBSTR(rest, INSTR(rest || ',', ',') + 1) AS rest
    FROM split_investors
    WHERE rest <> ''
)

SELECT
    company,
    valuation_b,
    date_joined,
    country,
    city,
    industry,
    select_investors,
    year_joined,
    month_joined,
    investor
FROM
    split_investors
WHERE
    investor <> ''
ORDER BY
    company, date_joined, investor;


 * sqlite://
Done.


[]

In [None]:
%%sql
-- Preview the first five rows of the per-investor table
SELECT * FROM unicorn_investors LIMIT 5;

 * sqlite://
Done.


company,valuation_b,date_joined,country,city,industry,select_investors,year_joined,month_joined,investor
ByteDance,225.0,2017-04-07,China,Beijing,Media & Entertainment,"Sequoia Capital China, SIG Asia Investments, Sina Weibo, SoftBank Group",2017,4,SIG Asia Investments
ByteDance,225.0,2017-04-07,China,Beijing,Media & Entertainment,"Sequoia Capital China, SIG Asia Investments, Sina Weibo, SoftBank Group",2017,4,Sequoia Capital China
ByteDance,225.0,2017-04-07,China,Beijing,Media & Entertainment,"Sequoia Capital China, SIG Asia Investments, Sina Weibo, SoftBank Group",2017,4,Sina Weibo
ByteDance,225.0,2017-04-07,China,Beijing,Media & Entertainment,"Sequoia Capital China, SIG Asia Investments, Sina Weibo, SoftBank Group",2017,4,SoftBank Group
SpaceX,200.0,2012-12-01,United States,Hawthorne,Industrials,"Founders Fund, Draper Fisher Jurvetson, Rothenberg Ventures",2012,12,Draper Fisher Jurvetson


In [None]:
%%sql
-- Ten investors with the most unicorn investments, plus the total and average valuation
-- of the companies they backed.
-- Ordering uses the computed investor_count (previously COUNT(select_investors), a column
-- that is not part of this aggregation); the investor name breaks ties deterministically.
SELECT
    investor,
    COUNT(investor) AS investor_count,
    ROUND(SUM(valuation_b), 2) AS total_valuation,
    ROUND(AVG(valuation_b), 2) AS avg_valuation
FROM unicorn_investors
GROUP BY investor
ORDER BY investor_count DESC, investor ASC
LIMIT 10;

 * sqlite://
Done.


investor,investor_count,total_valuation,avg_valuation
Andreessen Horowitz,72,307.61,4.27
Accel,65,218.83,3.37
Sequoia Capital,55,196.95,3.58
Tiger Global Management,53,137.82,2.6
Insight Partners,50,134.94,2.7
General Catalyst,42,123.03,2.93
Lightspeed Venture Partners,40,120.73,3.02
Sequoia Capital China,39,380.0,9.74
Index Ventures,36,156.6,4.35
SoftBank Group,33,369.05,11.18


#### Key Findings
- The correlation between the number of investors and valuation is 0.0402, meaning there is no significant linear relationship between the two variables.
- The investor with the most unicorn investments is Andreessen Horowitz, with 72.
- The investor whose portfolio companies have the highest combined valuation is Sequoia Capital China, with 380 billion USD.

# Geographical Analysis


In [None]:
%%sql
-- For each continent/industry pair: number of companies, summed investor count,
-- and total and average valuation
SELECT
    continent,
    industry,
    COUNT(company) AS company_count,
    SUM(investor_count) AS investor_count,
    ROUND(SUM(valuation_b), 2) AS total_valuation,
    ROUND(AVG(valuation_b), 2) AS avg_valuation
FROM unicorn_table_f
GROUP BY continent, industry
ORDER BY continent;

 * sqlite://
Done.


continent,industry,company_count,investor_count,total_valuation,avg_valuation
Africa,Consumer & Retail,1,2,1.59,1.59
Africa,Financial Services,4,12,14.7,3.68
Asia,Consumer & Retail,86,242,235.22,2.74
Asia,Enterprise Tech,57,163,113.94,2.0
Asia,Financial Services,36,103,85.85,2.38
Asia,Healthcare & Life Sciences,16,39,32.93,2.06
Asia,Industrials,74,195,180.41,2.44
Asia,Insurance,2,6,3.1,1.55
Asia,Media & Entertainment,28,82,303.37,10.83
Europe,Consumer & Retail,31,82,60.23,1.94


#### Key Findings
- Asia has the highest number of
-
-

## Data Visualization
---

## Model Training
---

In [1]:
# Import necessary libraries
import polars as pl
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

import os

In [2]:
# NOTE(review): this repeats the load_data definition from the data-preparation section;
# the later definition silently shadows the earlier one.
def load_data(file_path) -> pl.DataFrame:
  """
  Read the CSV located at ``file_path`` and return it as a Polars DataFrame.

  Raises:
    FileNotFoundError: if nothing exists at ``file_path``.
  """
  # Happy path first: hand the file straight to Polars when it is present
  if os.path.exists(file_path):
    return pl.read_csv(file_path)

  raise FileNotFoundError(f"File not found at {file_path}")

In [3]:
# Valuations above this many billion USD form the positive class
VALUATION_THRESHOLD_B = 5

# Build the path with os.path.join so it resolves on Linux/Colab as well as on Windows:
# the previously hard-coded backslashes are only path separators on Windows, which is
# why the lookup failed with FileNotFoundError elsewhere.
file_path = os.path.join("data", "transformed_data", "cleaned_data.csv")
df = load_data(file_path)

# Binary target column: 1 when the valuation exceeds the threshold, otherwise 0
df = df.with_columns([
  (pl.col('valuation_($b)') > VALUATION_THRESHOLD_B).cast(pl.Int8).alias('valuation_target')
])


FileNotFoundError: File not found at data\transformed_data\cleaned_data.csv

In [None]:
# One seed shared by the CV splitter and the forest so repeated runs give the same scores
RANDOM_STATE = 42

# One LabelEncoder per categorical column, kept so the integer codes can be mapped back later
le_country = LabelEncoder()
le_city = LabelEncoder()
le_industry = LabelEncoder()

# Fit and transform the columns
df = df.with_columns([
  pl.Series('country_encoded', le_country.fit_transform(df['country'].to_numpy())),
  pl.Series('city_encoded', le_city.fit_transform(df['city'].to_numpy())),
  pl.Series('industry_encoded', le_industry.fit_transform(df['industry'].to_numpy()))
])

# %%
# Drop the raw text/date columns and the valuation itself (the target is derived from it,
# so keeping it would leak the answer into the features)
columns_to_drop = ['date_joined', 'country', 'city', 'industry', 'select_investors', 'year_joined', 'month_joined', 'valuation_($b)']
df = df.drop(columns_to_drop)

# %%
y = df['valuation_target']
X = df.drop('valuation_target', 'company')

# %%
# NOTE(review): gnb is created but never fitted or evaluated in this cell
gnb = GaussianNB()
rfm = RandomForestClassifier(random_state=RANDOM_STATE)

# Define parameter grid for RandomForestClassifier
param_grid = {
    "n_estimators": [50, 100, 150],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5, 10]
}

# Shuffle before splitting: the cleaned data appears to be ordered by descending valuation
# (see the preview earlier in the notebook), so contiguous folds would concentrate the
# positive class in the first fold.
kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
KF = kf  # alias kept for the earlier, duplicate splitter name

# Initialize GridSearchCV with cross-validation
# NOTE(review): accuracy can look high on an imbalanced target — consider an additional metric
grid_search = GridSearchCV(estimator=rfm, param_grid=param_grid, cv=kf, scoring='accuracy')

# Fit on plain NumPy arrays so fold indexing does not depend on scikit-learn's Polars support
grid_search.fit(X.to_numpy(), y.to_numpy())

# Get the best parameters and best cross-validation score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Cross-Validation Score:", best_score)