<a href="https://colab.research.google.com/github/NaimurRahmanR/StartUpSuccessPredictor/blob/main/Naimur_Dissertation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Logical table name -> CSV file it is read from
datasets = {
    "acquisitions": "/content/acquisitions.csv",
    "degrees": "/content/degrees.csv",
    "funding_rounds": "/content/funding_rounds.csv",
    "funds": "/content/funds.csv",
    "investments": "/content/investments.csv",
    "ipos": "/content/ipos.csv",
    "milestones": "/content/milestones.csv",
    "objects": "/content/objects.csv",
    "offices": "/content/offices.csv",
    "people": "/content/people.csv"
}

# Read every CSV up front so later cells can look tables up by name
dataframes = {}
for table_name, csv_path in datasets.items():
    dataframes[table_name] = pd.read_csv(csv_path)

# Print a banner followed by the schema/memory summary of each table
for table_name, frame in dataframes.items():
    print(f"--- {table_name.upper()} DATAFRAME ---")
    frame.info()
    print("\n")


In [None]:
from collections import defaultdict

# Relies on the 'dataframes' dictionary built when the CSV files were loaded.

# Column name -> set of table names in which that column appears
column_occurrences = defaultdict(set)
for table_name, frame in dataframes.items():
    for column_name in frame.columns:
        column_occurrences[column_name].add(table_name)

# Keep only the columns shared by at least two tables
common_columns = {}
for column_name, owning_tables in column_occurrences.items():
    if len(owning_tables) > 1:
        common_columns[column_name] = owning_tables

# Bare expression so the notebook renders the mapping
common_columns


In [None]:
# Candidate column names for the startup name, in order of preference
name_columns = ['name', 'startup_name', 'company_name']

# Table name -> first candidate column found in that table
datasets_with_name_column = {}
for table_name, frame in dataframes.items():
    first_match = next(
        (candidate for candidate in name_columns if candidate in frame.columns),
        None,
    )
    if first_match is not None:
        datasets_with_name_column[table_name] = first_match

# Report which tables expose a name-like column
print("Datasets that can be merged based on the startup name:")
for dataset_name, column_name in datasets_with_name_column.items():
    print(f"- {dataset_name} (column: {column_name})")


In [None]:
import pandas as pd

# Relies on the 'dataframes' dictionary built when the CSV files were loaded.

# Lower-case the join key in both tables so the match is case-insensitive.
# This overwrites the 'name' column of the frames stored in 'dataframes'.
for table_name in ('objects', 'funds'):
    dataframes[table_name]['name'] = dataframes[table_name]['name'].str.lower()

# Left join: every 'objects' row is kept; colliding 'funds' columns get '_funds'
merged_df = pd.merge(
    dataframes['objects'],
    dataframes['funds'],
    on='name',
    how='left',
    suffixes=('', '_funds'),
)

# Preview the result
print(merged_df.head())

# Persist the joined table for the following cells
merged_df.to_csv('/content/funds_object.csv', index=False)


In [None]:
import pandas as pd

# Read the objects+funds join back from disk and register it next to the
# other tables so it can be compared with them by name.
dataframes['funds_object'] = pd.read_csv('/content/funds_object.csv')
funds_object_df = dataframes['funds_object']

def find_common_columns(df1, df2):
    """Return the set of column labels present in both ``df1`` and ``df2``."""
    return set(df1.columns) & set(df2.columns)

# Table name -> columns it shares with 'funds_object' (tables sharing none are omitted)
common_columns = {}
for other_name, other_frame in dataframes.items():
    if other_name == 'funds_object':
        continue  # do not compare the table with itself
    shared = find_common_columns(funds_object_df, other_frame)
    if shared:
        common_columns[other_name] = shared

# Report the overlaps
print("Common columns with 'funds_object' dataset:")
for dataset_name, columns in common_columns.items():
    print(f"- {dataset_name}: {', '.join(columns)}")


In [None]:
# Relies on the 'dataframes' dictionary of loaded tables.

# Table name -> whether the table has an 'object_id' column
object_id_presence = {
    table_name: 'object_id' in frame.columns
    for table_name, frame in dataframes.items()
}

# Print one Yes/No line per table
for dataset, has_object_id in object_id_presence.items():
    answer = 'Yes' if has_object_id else 'No'
    print(f"{dataset}: {answer}")


In [None]:
import pandas as pd

# Relies on 'dataframes' already containing the 'funds_object' table.

# Start from the objects+funds join and attach the remaining tables to it
primary_df = dataframes['funds_object']

# Tables to attach through their 'object_id' column
datasets_to_merge = ['degrees', 'funding_rounds', 'funds', 'ipos', 'milestones', 'offices', 'people']

# NOTE(review): 'funds' was already joined by name when 'funds_object' was
# built, and these left joins use a key that may repeat in the right-hand
# tables, which would multiply rows - confirm the resulting row count is intended.
for dataset_name in datasets_to_merge:
    right_frame = dataframes[dataset_name]
    if 'object_id' not in right_frame.columns:
        continue  # nothing to join on
    primary_df = primary_df.merge(
        right_frame,
        on='object_id',
        how='left',
        suffixes=('', f'_{dataset_name}'),
    )

# Preview the result
print(primary_df.head())

# Persist the combined table for the following cells
primary_df.to_csv('/content/eight_in_one.csv', index=False)


In [None]:
# @title
# Read the combined table back from disk and register it in 'dataframes'
dataframes['eight_in_one'] = pd.read_csv('/content/eight_in_one.csv')
eight_in_one_df = dataframes['eight_in_one']

# Same helper as defined in an earlier cell of this notebook; redefining it
# keeps this cell runnable on its own.
def find_common_columns(df1, df2):
    """Return the set of column labels present in both ``df1`` and ``df2``."""
    return set(df1.columns) & set(df2.columns)

# Columns that 'eight_in_one' shares with the two tables not yet attached
common_columns_acquisitions, common_columns_investments = (
    find_common_columns(eight_in_one_df, dataframes[table_name])
    for table_name in ('acquisitions', 'investments')
)

# Show the overlaps
print("Common columns with 'acquisitions':", common_columns_acquisitions)
print("Common columns with 'investments':", common_columns_investments)


In [None]:
# Relies on the 'dataframes' dictionary of loaded tables.

# Tables that were attached through 'object_id'
datasets_to_merge = ['degrees', 'funding_rounds', 'funds', 'ipos', 'milestones', 'offices', 'people']

# Everything registered in 'dataframes' that is neither in the list above
# nor the primary table 'funds_object'
datasets_not_merged = [
    dataset
    for dataset in dataframes
    if dataset not in datasets_to_merge and dataset != 'funds_object'
]

# List the remaining tables
print("Datasets that have not been merged:")
for dataset in datasets_not_merged:
    print(dataset)


In [None]:
import pandas as pd

# Load the merged 'eight_in_one.csv' dataset
eight_in_one_df = pd.read_csv('/content/eight_in_one.csv')

# Collect the merge suffixes that actually occur in the column names.
# A column carries the suffix of a table when it ends with '_<table name>'.
# Splitting on '_' and taking the second token would instead pick up
# arbitrary words ('created_at_degrees' -> 'at') and could never match a
# multi-word table name such as 'funding_rounds'.
suffixes = [
    dataset
    for col in eight_in_one_df.columns
    for dataset in datasets_to_merge
    if col.endswith(f'_{dataset}')
]

# Deduplicate the suffixes list
unique_suffixes = list(set(suffixes))

# Identify the datasets merged based on their suffixes.
# NOTE(review): a table only leaves a suffix behind when one of its columns
# collided with an existing column name, so a table merged without any
# collision is not detected here - confirm this heuristic is sufficient.
merged_datasets = [dataset for dataset in datasets_to_merge if dataset in unique_suffixes]

# Print the datasets that were merged into 'eight_in_one.csv'
print("Datasets merged into 'eight_in_one.csv':")
for dataset in merged_datasets:
    print(dataset)


In [None]:
# Ask pandas to parse the funding total as float in both files.
# NOTE(review): the '_objects' suffix is only produced by a merge in a later
# cell, so this column may not exist in either file yet - confirm.
funding_dtype = {'funding_total_usd_objects': float}
objects_df = pd.read_csv("/content/objects.csv", dtype=funding_dtype)
eight_in_one_df = pd.read_csv("/content/eight_in_one.csv", dtype=funding_dtype)


In [None]:
import pandas as pd

# Read the three tables that are still separate, plus the combined table
acquisitions_df = pd.read_csv("/content/acquisitions.csv")
investments_df = pd.read_csv("/content/investments.csv")
objects_df = pd.read_csv("/content/objects.csv")
eight_in_one_df = pd.read_csv("/content/eight_in_one.csv")

# Attach each table by row position (index-to-index left join); colliding
# column names from the right-hand table receive the given suffix.
# NOTE(review): joining on the positional index pairs row i of one file with
# row i of another regardless of which entity each row describes - confirm
# this is intended, or join on the appropriate id columns instead.
unified_df = eight_in_one_df
for extra_frame, extra_suffix in (
    (acquisitions_df, '_acquisitions'),
    (investments_df, '_investments'),
    (objects_df, '_objects'),
):
    unified_df = unified_df.merge(
        extra_frame,
        left_index=True,
        right_index=True,
        how='left',
        suffixes=('', extra_suffix),
    )

# Preview the result
print(unified_df.head())

# Persist the unified table for the following cells
unified_df.to_csv('/content/unified_dataset.csv', index=False)


In [None]:
import pandas as pd

# Read the unified table back from disk
unified_df = pd.read_csv('/content/unified_dataset.csv')

# Column labels, dimensions and summary statistics
column_names = list(unified_df.columns)
num_rows = len(unified_df)
num_columns = len(column_names)
dataset_info = unified_df.describe()

# Report size and schema, one column per line
print(f"Number of Rows: {num_rows}")
print(f"Number of Columns: {num_columns}")
print("Column Names:")
for column in column_names:
    print(f"- {column}")

# Report the summary statistics
print("\nBasic Statistics:")
print(dataset_info)


In [None]:
import pandas as pd

# Read the unified table back from disk
unified_df = pd.read_csv('/content/unified_dataset.csv')

# Numeric columns: missing values become 0
numeric_columns = unified_df.select_dtypes(include=['number']).columns
numeric_filled = unified_df[numeric_columns].fillna(0)
unified_df[numeric_columns] = numeric_filled

# All remaining columns: missing values become the placeholder 'Unknown'
categorical_columns = unified_df.select_dtypes(exclude=['number']).columns
categorical_filled = unified_df[categorical_columns].fillna('Unknown')
unified_df[categorical_columns] = categorical_filled

# Persist the filled table under a new name
unified_df.to_csv('/content/cleaned_unified_dataset.csv', index=False)


In [None]:
!pip install xgboost
!pip install lightgbm
!pip install catboost

# Data Manipulation
import pandas as pd
import numpy as np

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Additional ML libraries
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

# Set the style of the visualization
sns.set(style="whitegrid")


In [None]:
# Load the dataset
file_path = '/content/cleaned_unified_dataset.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(data.head())

# Get a concise summary of the dataframe.
# info() writes its report directly and returns None, so it must not be
# wrapped in print() (that would append a stray "None" line).
print("\nDataframe Information:")
data.info()

# Basic statistical details
print("\nBasic Statistical Details:")
print(data.describe())


In [None]:
# Keep the first row seen for every distinct value of 'name'.
# NOTE(review): the cleaned file replaces missing text values with 'Unknown',
# so all rows whose name was missing collapse into a single row here -
# confirm this is intended.
repeated_name = data.duplicated(subset=['name'])
unique_startups = data[~repeated_name]

# Show the de-duplicated rows
print("Rows with Unique Startup Names:")
print(unique_startups)


In [None]:
# Display the first few rows of the dataset
print("First few rows of the unique startup dataset:")
print(unique_startups.head())

# Summarize the dataset.
# info() writes its report directly and returns None, so it must not be
# wrapped in print() (that would append a stray "None" line).
print("\nSummary of the unique startup dataset:")
unique_startups.info()

# Display basic statistical details for numerical columns
print("\nBasic Statistical Details:")
print(unique_startups.describe())


In [None]:
# Column labels of the de-duplicated table
print("Column Names:")
print(unique_startups.columns)

# Number of missing values per column
null_counts = unique_startups.isna().sum()
print("\nCount of Null Values in Each Column:")
print(null_counts)

# Number of rows that exactly repeat an earlier row
duplicate_rows = unique_startups.duplicated().sum()
print("\nNumber of Duplicate Rows:", duplicate_rows, sep="\n")


In [None]:
# One row per distinct name, so the row count is the number of startups
number_of_startups = len(unique_startups)

print(f"Number of unique startups in the dataset: {number_of_startups}")


In [None]:
# Function to find duplicate columns
def get_duplicate_columns(unique_startups):
    """Return the labels of columns whose contents repeat an earlier column.

    Columns are compared pairwise by position with ``Series.equals``. The
    first column of each group of identical columns serves as the reference
    and is not reported; every later copy is reported exactly once.

    Args:
        unique_startups: DataFrame whose columns are compared.

    Returns:
        list: Labels of the redundant columns, grouped by the reference
        column they copy and in column order within each group.
    """
    labels = list(unique_startups.columns)
    already_flagged = set()  # positions already known to copy an earlier column
    duplicate_columns = []
    for i in range(len(labels)):
        if i in already_flagged:
            # Anything equal to this column also equals its reference and was
            # reported when the reference was processed.
            continue
        reference = unique_startups.iloc[:, i]
        for j in range(i + 1, len(labels)):
            if j in already_flagged:
                continue
            if reference.equals(unique_startups.iloc[:, j]):
                already_flagged.add(j)
                duplicate_columns.append(labels[j])
    return duplicate_columns

# Scan the de-duplicated table for columns that repeat an earlier column
duplicates = get_duplicate_columns(unique_startups)
print(f"Duplicate Columns: {duplicates}")


In [None]:
# Redundant columns to remove. The list previously repeated the same labels
# hundreds of times; it is reduced here to the distinct labels, in order of
# first appearance, which drops exactly the same columns.
duplicate_columns = [
    'description_offices', 'address1', 'address2', 'zip_code', 'state_code_offices',
    'fund_id', 'raised_currency_code_funds', 'updated_at_funds',
    'degree_type', 'subject', 'institution', 'graduated_at',
    'created_at_degrees', 'updated_at_degrees',
    'id_funding_rounds', 'funding_round_id', 'funded_at_funding_rounds',
    'funding_round_type', 'funding_round_code',
    'raised_amount_usd', 'raised_amount_funding_rounds', 'raised_currency_code_funding_rounds',
    'pre_money_valuation_usd', 'pre_money_valuation', 'pre_money_currency_code',
    'post_money_valuation_usd', 'post_money_valuation', 'post_money_currency_code',
    'participants', 'is_first_round', 'is_last_round',
    'source_url_funding_rounds', 'source_description_funding_rounds',
    'created_by_funding_rounds', 'created_at_funding_rounds', 'updated_at_funding_rounds',
    'valuation_amount', 'raised_amount_ipos', 'public_at',
    'source_url_ipos', 'source_description_ipos',
    'latitude', 'longitude', 'created_at_offices', 'updated_at_offices',
    'id_people', 'first_name', 'last_name', 'birthplace', 'affiliation_name',
    'fund_id_funds', 'updated_at_funds.1', 'city_offices',
]

# Drop duplicate columns (raises KeyError if a listed column is absent)
df = unique_startups.drop(columns=duplicate_columns)

# Save the cleaned dataset
df.to_csv('/content/unique_startups.csv', index=False)

# Optional: Display basic information about the cleaned dataframe.
# info() writes its report directly and returns None, so it is not wrapped
# in print().
df.info()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Exploratory data analysis of the cleaned startup table:
# summary statistics, per-column distributions, correlations, pairwise
# scatter plots and an IQR-based outlier listing.

# Load the dataset from Google Colab path
df = pd.read_csv('/content/unique_startups.csv')

# 3.3.1 Descriptive Statistics
print("Descriptive Statistics:\n", df.describe())
print("\nFrequency of Categorical Variables:\n", df.select_dtypes(include=['object']).apply(pd.Series.value_counts))

# 3.3.2 Visualization of Distributions
# Continuous Variables: histogram (left) and box plot (right) per numeric column
for col in df.select_dtypes(include=['int64', 'float64']).columns:
    plt.figure(figsize=(10, 4))
    plt.subplot(1, 2, 1)
    df[col].hist()
    plt.title(f'Histogram of {col}')

    plt.subplot(1, 2, 2)
    df.boxplot(column=[col])
    plt.title(f'Box Plot of {col}')
    plt.show()

# Categorical Variables: one count plot per object-typed column
for col in df.select_dtypes(include=['object']).columns:
    plt.figure(figsize=(10, 4))
    sns.countplot(y=col, data=df)
    plt.title(f'Bar Chart of {col}')
    plt.show()

# 3.3.3 Correlation Analysis
# Correlate only numeric/boolean columns: the frame also contains object
# (string) columns, and DataFrame.corr() raises on those with pandas >= 2.0.
numeric_df = df.select_dtypes(include=['number', 'bool'])
plt.figure(figsize=(12, 10))
sns.heatmap(numeric_df.corr(), annot=True, cmap='viridis')
plt.title('Correlation Heatmap')
plt.show()

#  Cross-Variable Analysis
# Scatter plots for every unordered pair of continuous variables
continuous_vars = df.select_dtypes(include=['int64', 'float64']).columns
for i in range(len(continuous_vars)):
    for j in range(i+1, len(continuous_vars)):
        plt.scatter(df[continuous_vars[i]], df[continuous_vars[j]])
        plt.title(f'Scatter Plot of {continuous_vars[i]} vs {continuous_vars[j]}')
        plt.xlabel(continuous_vars[i])
        plt.ylabel(continuous_vars[j])
        plt.show()



# Anomaly Detection
# Rows whose target value lies outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
# NOTE(review): the fallback to the first column only works if that column is
# numeric — confirm against the dataset.
target_column = 'funding_total_usd' if 'funding_total_usd' in df.columns else df.columns[0]
Q1 = df[target_column].quantile(0.25)
Q3 = df[target_column].quantile(0.75)
IQR = Q3 - Q1
outliers = df[(df[target_column] < (Q1 - 1.5 * IQR)) | (df[target_column] > (Q3 + 1.5 * IQR))]
print("Outliers in the dataset:\n", outliers)


In [None]:

# Split the columns of `df` by dtype: int64/float64 columns are treated as
# numerical, object columns as categorical (object may also hold free text
# or mixed types).
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns
categorical_columns = df.select_dtypes(include=['object']).columns

print("Numerical Columns:", numerical_columns, sep="\n")
print("\nCategorical Columns:", categorical_columns, sep="\n")


In [None]:
# A startup counts as successful when its `status` is one of these values
success_criteria = ['acquired', 'ipo']

# Keep only the rows whose status matches the success criteria
is_success_row = df['status'].isin(success_criteria)
successful_startups = df[is_success_row]

# Show the filtered frame
print("Successful Startups:", successful_startups, sep="\n")


In [None]:
# Count how many rows of `df` have a successful status and how many do not
success_criteria = ['acquired', 'ipo']
success_mask = df['status'].isin(success_criteria)

# Rows matching the criteria
successful_startup_count = len(df[success_mask])

# All remaining rows (including rows with a missing status)
other_startup_count = len(df[~success_mask])

print(f"Number of successful startups: {successful_startup_count}")
print(f"Number of other startups: {other_startup_count}")


In [None]:
# Candidate model features
potential_features = [
    'funding_total_usd', 'funding_rounds', 'founded_at', 'category_code',
    'country_code', 'city', 'milestones', 'relationships',
]

# Keep the candidates that are actual columns of `df`, in the listed order
df_columns = df.columns
available_features = [candidate for candidate in potential_features if candidate in df_columns]

print("Available Features for the Model:", available_features)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Exploratory data analysis of the cleaned startup table (second pass):
# summary statistics, per-column distributions, correlations, pairwise
# scatter plots and an IQR-based outlier listing.

# Load the dataset from Google Colab path
df = pd.read_csv('/content/unique_startups.csv')

# 3.3.1 Descriptive Statistics
print("Descriptive Statistics:\n", df.describe())
print("\nFrequency of Categorical Variables:\n", df.select_dtypes(include=['object']).apply(pd.Series.value_counts))

# 3.3.2 Visualization of Distributions
# Continuous Variables: histogram (left) and box plot (right) per numeric column
for col in df.select_dtypes(include=['int64', 'float64']).columns:
    plt.figure(figsize=(10, 4))
    plt.subplot(1, 2, 1)
    df[col].hist()
    plt.title(f'Histogram of {col}')

    plt.subplot(1, 2, 2)
    df.boxplot(column=[col])
    plt.title(f'Box Plot of {col}')
    plt.show()

# Categorical Variables: one count plot per object-typed column
for col in df.select_dtypes(include=['object']).columns:
    plt.figure(figsize=(10, 4))
    sns.countplot(y=col, data=df)
    plt.title(f'Bar Chart of {col}')
    plt.show()

# Correlation Analysis
# Correlate only numeric/boolean columns: the frame also contains object
# (string) columns, and DataFrame.corr() raises on those with pandas >= 2.0.
numeric_df = df.select_dtypes(include=['number', 'bool'])
plt.figure(figsize=(12, 10))
sns.heatmap(numeric_df.corr(), annot=True, cmap='viridis')
plt.title('Correlation Heatmap')
plt.show()

#  Cross-Variable Analysis
# Scatter plots for every unordered pair of continuous variables
continuous_vars = df.select_dtypes(include=['int64', 'float64']).columns
for i in range(len(continuous_vars)):
    for j in range(i+1, len(continuous_vars)):
        plt.scatter(df[continuous_vars[i]], df[continuous_vars[j]])
        plt.title(f'Scatter Plot of {continuous_vars[i]} vs {continuous_vars[j]}')
        plt.xlabel(continuous_vars[i])
        plt.ylabel(continuous_vars[j])
        plt.show()

#  Temporal Analysis
# Not implemented: this cell contains no code for date/time columns.

# Anomaly Detection
# Rows whose target value lies outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
# NOTE(review): the fallback to the first column only works if that column is
# numeric — confirm against the dataset.
target_column = 'funding_total_usd' if 'funding_total_usd' in df.columns else df.columns[0]
Q1 = df[target_column].quantile(0.25)
Q3 = df[target_column].quantile(0.75)
IQR = Q3 - Q1
outliers = df[(df[target_column] < (Q1 - 1.5 * IQR)) | (df[target_column] > (Q3 + 1.5 * IQR))]
print("Outliers in the dataset:\n", outliers)


In [None]:
import pandas as pd
import scipy.stats as stats

# One-way ANOVA: does `funding_total_usd` differ across `category_code` groups?

# Reload the cleaned dataset
data = pd.read_csv('/content/unique_startups.csv')

# Keep the two columns under test and discard rows where either is missing
anova_columns = ['category_code', 'funding_total_usd']
anova_data = data[anova_columns].dropna()

# One list of funding values per category
groups = anova_data.groupby('category_code')['funding_total_usd'].apply(list)

# Each category's list is passed to f_oneway as a separate sample
samples = groups.values
f_statistic, p_value = stats.f_oneway(*samples)

print(f"F-Statistic: {f_statistic}, P-Value: {p_value}")


In [None]:
import pandas as pd
import scipy.stats as stats

# One-way ANOVA for every (categorical, numerical) feature pair, computed on
# the successful startups only.

# Load the dataset
file_path = '/content/unique_startups.csv'
data = pd.read_csv(file_path)

# List of potential features for the model
potential_features = ['funding_total_usd', 'funding_rounds', 'founded_at', 'category_code', 'country_code', 'city', 'milestones', 'relationships']

# Define the success criteria
success_criteria = ['acquired', 'ipo']

# Filter the DataFrame for successful startups
successful_startups = data[data['status'].isin(success_criteria)]

# Identifying numerical and categorical columns among the selected features
numerical_columns = successful_startups.select_dtypes(include=['int64', 'float64']).columns.intersection(potential_features)
categorical_columns = successful_startups.select_dtypes(include=['object']).columns.intersection(potential_features)

# Dictionary to store ANOVA results, keyed by (categorical column, numerical column)
anova_results = {}

# Perform ANOVA for each pair of categorical and numerical columns among selected features
for num_col in numerical_columns:
    for cat_col in categorical_columns:
        # Drop rows missing either value first: a single NaN inside a group
        # makes f_oneway return NaN for the whole test.
        pair_data = successful_startups[[cat_col, num_col]].dropna()

        # Grouping data: one list of numeric values per category level
        groups = pair_data.groupby(cat_col)[num_col].apply(list)

        # Keep groups with more than one observation; the test needs at least two of them
        valid_groups = [group for group in groups if len(group) > 1]
        if len(valid_groups) > 1:
            f_statistic, p_value = stats.f_oneway(*valid_groups)
            anova_results[(cat_col, num_col)] = (f_statistic, p_value)

# Print the ANOVA results
for pair, result in anova_results.items():
    cat_col, num_col = pair
    f_statistic, p_value = result
    print(f"ANOVA for {cat_col} and {num_col}: F-Statistic = {f_statistic}, P-Value = {p_value}")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

# Illustrative plot on SYNTHETIC data: average (random) success probability
# per funding-round count, with the distribution of funding rounds overlaid.
# NOTE: this rebinds `df` to 100 randomly generated rows, replacing whatever
# frame `df` referred to before this cell.

# Seed the generator so the figure is reproducible across kernel restarts
np.random.seed(42)

df = pd.DataFrame({
    'funding_rounds': np.random.randint(1, 10, 100),
    'relationships': np.random.randint(1, 20, 100),
    'funding_total_usd': np.random.uniform(1e6, 1e9, 100),
    'success_probability': np.random.uniform(0, 1, 100)
})

# Calculating average success probability for each 'funding_rounds'
avg_success_by_funding_rounds = df.groupby('funding_rounds')['success_probability'].mean().reset_index()

# Plotting
plt.figure(figsize=(14, 6))

# Bars are drawn at the numeric funding-round values. seaborn's barplot puts
# categories at positions 0..n-1, which shifted the bars relative to the
# density curve below (drawn at the actual values).
bar_colors = sns.color_palette('viridis', len(avg_success_by_funding_rounds))
plt.bar(avg_success_by_funding_rounds['funding_rounds'],
        avg_success_by_funding_rounds['success_probability'],
        color=bar_colors)
plt.xticks(avg_success_by_funding_rounds['funding_rounds'])

# Density plot overlaid for distribution (shares the same numeric x axis)
sns.kdeplot(df['funding_rounds'], bw_adjust=0.5, color='red', fill=True, alpha=0.3)

plt.title('Average Success Probability by Funding Rounds with Distribution Overlay')
plt.xlabel('Funding Rounds')
plt.ylabel('Average Success Probability')
plt.grid(True)
plt.show()


In [None]:
# Replace missing values in numeric columns with that column's mean;
# non-numeric columns have no entry in the means and are left untouched.
numeric_column_means = df.mean(numeric_only=True)
df.fillna(numeric_column_means, inplace=True)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Random Forest baseline on `data`: encode the predictors, split 70/30, fit,
# report test metrics and list the five most important features.

# Predictors used by the model
selected_features = [
    'funding_total_usd', 'funding_rounds', 'founded_at', 'category_code',
    'country_code', 'city', 'milestones', 'relationships',
]

# Binary target: 1 when `status` is 'acquired' or 'ipo', otherwise 0
data['is_successful'] = data['status'].apply(lambda status: 1 if status in ['acquired', 'ipo'] else 0)

# Integer-encode every object-typed predictor. Values are stringified first,
# so missing entries become the literal category 'nan'. The fitted encoders
# are kept per column in `label_encoders`.
label_encoders = {}
object_features = data[selected_features].select_dtypes(include=['object']).columns
for feature in object_features:
    encoder = LabelEncoder()
    label_encoders[feature] = encoder
    data[feature] = encoder.fit_transform(data[feature].astype(str))

# Numeric predictors are not imputed in this cell (the mean-fill step was left disabled).

# 70/30 train/test split with a fixed seed
X = data[selected_features]
y = data['is_successful']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Test-set metrics
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))

# Importances indexed by feature name, largest first
feature_importances = pd.DataFrame(
    {'importance': model.feature_importances_},
    index=X_train.columns,
).sort_values('importance', ascending=False)
print("Top 5 Important Features:")
print(feature_importances.head(5))


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of the five most important Random Forest features.
# The values are read from the fitted `model` (the Random Forest trained in
# the preceding cell) instead of being pasted from an earlier run, so the
# chart always matches the model in memory.
feature_importances = (
    pd.DataFrame({'feature': X_train.columns, 'importance': model.feature_importances_})
    .sort_values('importance', ascending=False)  # largest importance first
    .head(5)
)

# Create a bar plot
plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=feature_importances)
plt.title('Top 5 Important Features for Predicting Startup Success')
plt.xlabel('Importance')
plt.ylabel('Features')
plt.show()


In [None]:
import pandas as pd

# Location of the cleaned dataset
file_path = '/content/unique_startups.csv'

# Load the CSV file into `df`. A failure is reported and then re-raised:
# every later cell needs `df`, and continuing after a failed load would
# silently reuse whatever `df` was bound to before this cell.
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print("File not found. Please check the file path.")
    raise
except Exception as e:
    print(f"An error occurred: {e}")
    raise
else:
    print("File loaded successfully.")


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import RFE
import pandas as pd

# Linear SVM on `df`: encode, impute, standardise, split 70/30, fit and
# evaluate, then run recursive feature elimination down to five features.
# Side effect: the selected feature columns of `df` are overwritten with
# their encoded, mean-filled and standardised values.
selected_features = [
    'funding_total_usd', 'funding_rounds', 'founded_at', 'category_code',
    'country_code', 'city', 'milestones', 'relationships',
]

# Integer-encode object-typed predictors (stringified first, so missing
# values become the category 'nan'); keep one fitted encoder per column
label_encoders = {}
for feature in df[selected_features].select_dtypes(include=['object']).columns:
    encoder = LabelEncoder()
    label_encoders[feature] = encoder
    df[feature] = encoder.fit_transform(df[feature].astype(str))

# Mean-fill missing values in numeric columns
numeric_means = df.mean(numeric_only=True)
df.fillna(numeric_means, inplace=True)

# Binary target: 1 for 'acquired' / 'ipo', else 0
df['is_successful'] = df['status'].apply(lambda status: 1 if status in ['acquired', 'ipo'] else 0)

# Standardise the predictors (the scaler is fitted on the full frame, before the split)
scaler = StandardScaler()
df[selected_features] = scaler.fit_transform(df[selected_features])

# 70/30 train/test split with a fixed seed
X = df[selected_features]
y = df['is_successful']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the linear-kernel SVM
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

# Test-set metrics
predictions = svm_model.predict(X_test)
print(classification_report(y_test, predictions))

# Recursive feature elimination, removing one feature per step until five remain
selector = RFE(svm_model, n_features_to_select=5, step=1).fit(X_train, y_train)

# Boolean mask over the feature names; True marks a retained feature
top_features = pd.Series(selector.support_, index=X.columns)
print("Top Features according to RFE:")
print(top_features[top_features].index.tolist())


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Bar chart showing which features the RFE selector kept.
# The mask comes from the fitted `selector` (RFE cell above) rather than a
# pasted list, so it cannot drift from the actual selection.
top_features = pd.Series(selector.support_, index=X.columns)

# Create a DataFrame for visualization; the flag is cast to 0/1 so the bar
# length is numeric (1 = selected, 0 = eliminated)
features_df = pd.DataFrame({'Feature': top_features.index, 'Selected': top_features.values.astype(int)})

# Create a bar plot
plt.figure(figsize=(10, 6))
sns.barplot(x='Selected', y='Feature', data=features_df)
plt.title('Top Features Selected by RFE with SVM Classifier')
plt.xlabel('Selected')
plt.ylabel('Features')
plt.show()


In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression on the current training split
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Use the absolute value of each coefficient as the feature's importance,
# ordered from largest to smallest
coefficient_magnitudes = abs(log_reg.coef_[0])
log_reg_importance = pd.DataFrame(
    {'feature': X_train.columns, 'importance': coefficient_magnitudes}
).sort_values(by='importance', ascending=False)
print("Logistic Regression Feature Importance:\n", log_reg_importance)


In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Fit a gradient boosting classifier on the current training split
gb_clf = GradientBoostingClassifier()
gb_clf.fit(X_train, y_train)

# Model-reported feature importances, ordered from largest to smallest
gb_importance = pd.DataFrame(
    {'feature': X_train.columns, 'importance': gb_clf.feature_importances_}
).sort_values(by='importance', ascending=False)
print("Gradient Boosting Feature Importance:\n", gb_importance)


In [None]:
import xgboost as xgb

# Fit an XGBoost classifier on the current training split
xgb_clf = xgb.XGBClassifier()
xgb_clf.fit(X_train, y_train)

# Model-reported feature importances, ordered from largest to smallest
xgb_importance = pd.DataFrame(
    {'feature': X_train.columns, 'importance': xgb_clf.feature_importances_}
).sort_values(by='importance', ascending=False)
print("XGBoost Feature Importance:\n", xgb_importance)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming you have dataframes 'log_reg_importance', 'gb_importance', and 'xgb_importance' from the respective models

def plot_feature_importance(importance_df, title):
    """Draw a horizontal bar chart of feature importances and show it.

    Args:
        importance_df: DataFrame with a 'feature' column (bar labels) and an
            'importance' column (bar lengths).
        title: Text used as the figure title.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x='importance', y='feature', data=importance_df, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Importance')
    ax.set_ylabel('Feature')
    plt.show()

# One importance chart per model, in the order: Logistic Regression,
# Gradient Boosting, XGBoost
importance_charts = [
    (log_reg_importance, 'Logistic Regression Feature Importance'),
    (gb_importance, 'Gradient Boosting Feature Importance'),
    (xgb_importance, 'XGBoost Feature Importance'),
]
for importance_frame, chart_title in importance_charts:
    plot_feature_importance(importance_frame, chart_title)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

# Combine per-model feature importances into one ranking and plot the top 3.
# The per-model tables below are literal values (top five features per model);
# they rebind `log_reg_importance`, `gb_importance` and `xgb_importance`.
rf_importance = pd.DataFrame({
    'feature': ['city', 'founded_at', 'relationships', 'category_code', 'funding_total_usd'],
    'importance': [0.240962, 0.191549, 0.159871, 0.153828, 0.077119],
    'model': 'Random Forest',
})
log_reg_importance = pd.DataFrame({
    'feature': ['relationships', 'milestones', 'country_code', 'founded_at', 'category_code'],
    'importance': [0.459277, 0.205762, 0.203600, 0.091770, 0.090510],
    'model': 'Logistic Regression',
})
gb_importance = pd.DataFrame({
    'feature': ['founded_at', 'relationships', 'milestones', 'category_code', 'funding_total_usd'],
    'importance': [0.292110, 0.247738, 0.162949, 0.109068, 0.089996],
    'model': 'Gradient Boosting',
})
xgb_importance = pd.DataFrame({
    'feature': ['milestones', 'founded_at', 'relationships', 'category_code', 'country_code'],
    'importance': [0.178565, 0.156499, 0.154944, 0.127769, 0.115070],
    'model': 'XGBoost',
})
# RFE yields no scores, so each selected feature is given importance 1
svm_rfe_importance = pd.DataFrame({
    'feature': ['funding_total_usd', 'funding_rounds', 'category_code', 'milestones', 'relationships'],
    'importance': [1, 1, 1, 1, 1],
    'model': 'SVM with RFE',
})

# Stack all models and min-max scale the single 'importance' column.
# NOTE(review): the scaling is global across models, so the SVM rows (all 1)
# define the maximum, and the mean below is taken only over the models that
# list a feature. A feature listed once with score 1 therefore averages 1.0 —
# confirm this weighting is intended.
combined = pd.concat([rf_importance, log_reg_importance, gb_importance, xgb_importance, svm_rfe_importance])
scaler = MinMaxScaler()
combined['importance_normalized'] = scaler.fit_transform(combined[['importance']])

# Mean normalized score per feature; keep the three highest
aggregate_importance = combined.groupby('feature', as_index=False)['importance_normalized'].mean()
top_features = aggregate_importance.sort_values('importance_normalized', ascending=False).head(3)

# Visualization
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='importance_normalized', y='feature', data=top_features, ax=ax)
ax.set_title('Top 3 Important Features Across Models')
ax.set_xlabel('Normalized Aggregate Importance')
ax.set_ylabel('Feature')
plt.show()


In [None]:

# Names of the three features used by the reduced models in the following cells
top_features = [
    'funding_rounds',
    'relationships',
    'funding_total_usd',
]


In [None]:
# Restrict the existing train/test predictors to the top-feature columns
X_train_top, X_test_top = X_train[top_features], X_test[top_features]


In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Fit five classifiers on the top-feature split, report each test accuracy,
# then combine their predictions by majority vote.

# Initialize models (fixed seed for reproducibility)
rf_clf = RandomForestClassifier(random_state=42)
svm_clf = SVC(kernel='linear', random_state=42)
log_reg = LogisticRegression(random_state=42)
gb_clf = GradientBoostingClassifier(random_state=42)
xgb_clf = xgb.XGBClassifier(random_state=42)

classifiers = {
    'Random Forest': rf_clf,
    'SVM': svm_clf,
    'Logistic Regression': log_reg,
    'Gradient Boosting': gb_clf,
    'XGBoost': xgb_clf,
}

# Test-set predictions keyed by model name
predictions = {}

# Train and predict with each model
for name, clf in classifiers.items():
    clf.fit(X_train_top, y_train)
    preds = clf.predict(X_test_top)
    predictions[name] = preds
    accuracy = accuracy_score(y_test, preds)
    print(f"{name} Accuracy with top features: {accuracy:.4f}")

# Ensemble predictions (majority voting): row-wise mode across the five
# prediction columns; column 0 of the mode frame holds the winning label
vote_table = pd.DataFrame(predictions)
ensemble_preds = vote_table.mode(axis=1)[0]
ensemble_accuracy = accuracy_score(y_test, ensemble_preds)
print(f"Ensemble Accuracy with top features: {ensemble_accuracy:.4f}")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart comparing test accuracy of every model and of the ensemble.
# The scores are recomputed from `predictions` / `ensemble_preds` (built in the
# preceding cell) against `y_test`, so the chart reflects the current run
# instead of numbers pasted from an earlier output.
accuracies = {name: accuracy_score(y_test, preds) for name, preds in predictions.items()}
accuracies['Ensemble'] = accuracy_score(y_test, ensemble_preds)

# Convert to DataFrame for plotting
accuracy_df = pd.DataFrame(list(accuracies.items()), columns=['Model', 'Accuracy'])

# Plot
plt.figure(figsize=(10, 6))
sns.barplot(x='Accuracy', y='Model', data=accuracy_df.sort_values('Accuracy', ascending=False), palette='viridis')
plt.title('Model Accuracies with Top Features')
plt.xlabel('Accuracy Score')
plt.ylabel('Model')
# Zoom the x axis onto the observed range (with a small margin) so that small
# differences are visible and no bar is clipped by a fixed window.
lowest_accuracy = accuracy_df['Accuracy'].min()
highest_accuracy = accuracy_df['Accuracy'].max()
plt.xlim(max(0.0, lowest_accuracy - 0.02), min(1.0, highest_accuracy + 0.02))
plt.show()


In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Hyperparameter tuning of a gradient boosting classifier on the top-feature
# training split, followed by evaluation of the best model on the test split.

# Grid of candidate hyperparameter values
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 4, 5],
    min_samples_split=[2, 4],
    min_samples_leaf=[1, 2],
    subsample=[0.8, 0.9, 1.0],
)

# Base estimator with a fixed seed
gb_clf = GradientBoostingClassifier(random_state=42)

# Exhaustive search scored by accuracy over 5 shuffled, stratified folds,
# using all available cores
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=gb_clf,
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training data
grid_search.fit(X_train_top, y_train)

# Report the winning combination
print("Best hyperparameters:\n", grid_search.best_params_)

# Best estimator found by the search
best_gb_clf = grid_search.best_estimator_

# Test-set predictions of the tuned model
tuned_predictions = best_gb_clf.predict(X_test_top)

# Test accuracy of the tuned model
tuned_accuracy = accuracy_score(y_test, tuned_predictions)
print("Tuned Model Accuracy:", tuned_accuracy)


In [None]:

# Binary target column: 1 when `status` is 'acquired' or 'ipo', otherwise 0
success_statuses = ['acquired', 'ipo']
df['is_successful'] = df['status'].apply(lambda status: 1 if status in success_statuses else 0)


In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

# Re-split `df` on the three top features, tune a gradient boosting
# classifier by grid search and evaluate the best model on the held-out 30%.

# Top features identified previously
top_features = ['funding_rounds', 'relationships', 'funding_total_usd']

# Predictors / target, then a 70/30 split with a fixed seed
X = df[top_features]
y = df['is_successful']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Grid of candidate hyperparameter values
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 4, 5],
    min_samples_split=[2, 4],
    min_samples_leaf=[1, 2],
    subsample=[0.8, 0.9, 1.0],
)

# Base estimator with a fixed seed
gb_clf = GradientBoostingClassifier(random_state=42)

# Exhaustive 5-fold search scored by accuracy, using all available cores
grid_search = GridSearchCV(
    estimator=gb_clf,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training data
grid_search.fit(X_train, y_train)

# Report the winning combination
print("Best hyperparameters:\n", grid_search.best_params_)

# Best estimator found by the search
best_gb_clf = grid_search.best_estimator_

# Predictions of the best model on the held-out split
y_pred = best_gb_clf.predict(X_test)

# Accuracy and per-class metrics on the held-out split
held_out_accuracy = accuracy_score(y_test, y_pred)
print("Validation Accuracy:", held_out_accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))


In [None]:
# Label-encode the target vectors (`y_train` / `y_test`, i.e. 'is_successful').
# The encoder is created here: `le` was previously only defined in a later
# cell, so this cell raised NameError on a top-to-bottom run.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_train = le.fit_transform(y_train)  # Fit and transform on training data
y_test = le.transform(y_test)        # Only transform on testing data


In [None]:
# Show the dtype of every predictor column in the training split
train_column_dtypes = X_train.dtypes
print(train_column_dtypes)


In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Date columns are left as text here so the later date-parsing cell can still
# convert them with pd.to_datetime; that matches the exclusion used by the
# later encoding cell.
date_columns = ['created_at_objects', 'updated_at_objects']

# Apply label encoding to each object-type column except the target column
for column in df.columns:
    if df[column].dtype == 'object' and column != 'is_successful' and column not in date_columns:
        # Fill missing values with a placeholder, then force every value to
        # str: object columns holding a mix of strings and other types make
        # LabelEncoder raise a TypeError.
        df[column] = df[column].fillna('Missing').astype(str)
        df[column] = le.fit_transform(df[column])


In [None]:
# Replace missing values in every int64/float64 column with that column's median
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
for numeric_column in numeric_columns:
    column_median = df[numeric_column].median()
    df[numeric_column] = df[numeric_column].fillna(column_median)


In [None]:
# Feature matrix: every column except the target and `status`.
# `status` is the column 'is_successful' is derived from, so keeping it in X
# would leak the label into the predictors.
# NOTE(review): other outcome-related columns may leak the target as well —
# review the remaining columns before modelling.
X = df.drop(columns=['is_successful', 'status'], errors='ignore')
y = df['is_successful']               # Target vector

# If 'is_successful' is categorical, apply label encoding
if y.dtype == 'object':
    y = le.fit_transform(y)


In [None]:
from sklearn.model_selection import train_test_split

# 70/30 train/test split with a fixed seed
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# Show the dtype of every predictor column in X_train
train_column_dtypes = X_train.dtypes
print(train_column_dtypes)


In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Label-encode every object column except the two date columns, which are
# parsed separately. Values are converted to str before encoding.
date_columns = ['created_at_objects', 'updated_at_objects']
columns_to_encode = [
    name for name in df.select_dtypes(include=['object']).columns
    if name not in date_columns
]
for column in columns_to_encode:
    as_text = df[column].astype(str)
    df[column] = as_text
    df[column] = le.fit_transform(df[column])


In [None]:
# Parse the two timestamp columns (unparseable values become NaT), expand
# each into year / month / day columns, then drop the original columns.
date_prefixes = ('created_at', 'updated_at')

for prefix in date_prefixes:
    source_column = f'{prefix}_objects'
    df[source_column] = pd.to_datetime(df[source_column], errors='coerce')

for prefix in date_prefixes:
    timestamps = df[f'{prefix}_objects'].dt
    df[f'{prefix}_year'] = timestamps.year
    df[f'{prefix}_month'] = timestamps.month
    df[f'{prefix}_day'] = timestamps.day

# Drop the original date columns
df.drop(columns=['created_at_objects', 'updated_at_objects'], inplace=True)


In [None]:
# Fill missing values in numeric columns with the column median.
# numeric_only=True keeps DataFrame.median() from raising when a non-numeric
# column is still present, matching the mean-fill used earlier in the notebook.
df.fillna(df.median(numeric_only=True), inplace=True)


In [None]:
# Feature matrix: every column except the target and `status`.
# `status` is the column 'is_successful' is derived from, so keeping it in X
# would leak the label into the predictors.
# NOTE(review): other outcome-related columns may leak the target as well —
# review the remaining columns before modelling.
X = df.drop(columns=['is_successful', 'status'], errors='ignore')
y = df['is_successful']

# 70/30 train/test split with a fixed seed
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb

# Model refinement: grid-search an SVM, a logistic regression and a random
# forest on all usable columns of the cleaned dataset, then report test metrics.
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load your dataset
df = pd.read_csv('/content/unique_startups.csv')

# Binary target: 1 when `status` is 'acquired' or 'ipo', otherwise 0
df['is_successful'] = df['status'].apply(lambda x: 1 if x in ['acquired', 'ipo'] else 0)

# Feature matrix. The freshly loaded frame still contains text columns and
# missing values, which the estimators below cannot consume, so it is prepared here:
#   * `status` is dropped together with the target (the target is derived from it);
#   * columns with no values at all are dropped;
#   * object columns are integer-encoded, boolean columns cast to 0/1.
# NOTE(review): other outcome-related columns may still leak the target —
# review the remaining columns.
X = df.drop(columns=['is_successful', 'status'])
X = X.dropna(axis=1, how='all')
for column in X.columns:
    if X[column].dtype == object:
        X[column] = LabelEncoder().fit_transform(X[column].astype(str))
    elif X[column].dtype == bool:
        X[column] = X[column].astype(int)
y = df['is_successful']               # Target variable

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Impute with medians learned on the training split only; a column that is
# entirely missing in the training split falls back to 0.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians).fillna(0)
X_test = X_test.fillna(train_medians).fillna(0)

# Standardise with statistics learned on the training split only (SVM and
# logistic regression are sensitive to feature scale).
refine_scaler = StandardScaler().fit(X_train)
X_train = pd.DataFrame(refine_scaler.transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(refine_scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

# Model refinement with GridSearchCV for SVM
parameters_svm = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
svc = SVC()
clf_svm = GridSearchCV(svc, parameters_svm)
clf_svm.fit(X_train, y_train)

# Model refinement for Logistic Regression (higher iteration cap to allow convergence)
parameters_lr = {'C': [0.001, 0.01, 0.1, 1, 10]}
logreg = LogisticRegression(max_iter=1000)
clf_lr = GridSearchCV(logreg, parameters_lr)
clf_lr.fit(X_train, y_train)

# Ensemble model (e.g., Random Forest), seeded like the other models in this notebook
parameters_rf = {'n_estimators': [100, 200], 'max_depth': [10, 20]}
rf = RandomForestClassifier(random_state=42)
clf_rf = GridSearchCV(rf, parameters_rf)
clf_rf.fit(X_train, y_train)

# Model validation for each model
for model, name in zip([clf_svm, clf_lr, clf_rf], ['SVM', 'Logistic Regression', 'Random Forest']):
    predictions = model.predict(X_test)
    print(f"--- {name} ---")
    print("Accuracy:", accuracy_score(y_test, predictions))
    print("Classification Report:")
    print(classification_report(y_test, predictions))
    print("\n")


In [None]:
import pandas as pd

# Turn off pandas' SettingWithCopyWarning (the option defaults to 'warn')
pd.set_option('mode.chained_assignment', None)


In [None]:
import pandas as pd

# Score a new startup dataset with the tuned gradient boosting model.
from sklearn.preprocessing import StandardScaler

# Load new data from a CSV file
new_data = pd.read_csv('/content/startup data.csv')

# Features the model was trained on
top_features = ['funding_rounds', 'relationships', 'funding_total_usd']

# Rebuild the training-time preprocessing for these features. In this notebook
# the training frame was mean-filled and standardised before `best_gb_clf`
# was fitted, so raw values must go through the same transform; otherwise
# every new row lands far outside the range the model saw.
# NOTE(review): assumes `best_gb_clf` was fitted on df[top_features] after the
# SVM cell's mean-fill + StandardScaler step — confirm the execution order.
training_features = pd.read_csv('/content/unique_startups.csv', usecols=top_features)[top_features]
training_means = training_features.mean()
top_feature_scaler = StandardScaler().fit(training_features.fillna(training_means))

# Fill gaps with the training means (not the new data's own means) and build
# a new frame rather than calling fillna in place on a column selection.
new_data_top_features = new_data[top_features].fillna(training_means)
new_data_top_features = pd.DataFrame(
    top_feature_scaler.transform(new_data_top_features),
    columns=top_features,
    index=new_data.index,
)

# Predicting with the trained model (best_gb_clf in this case)
predictions = best_gb_clf.predict(new_data_top_features)
# Print the predictions
print(predictions)



In [None]:
# Rank the new startups by predicted probability of success and show the top ten.

# Column 1 of predict_proba holds the probability of the positive class
class_probabilities = best_gb_clf.predict_proba(new_data_top_features)
success_probabilities = class_probabilities[:, 1]

# Attach the probabilities to the original rows for reference
new_data['success_probability'] = success_probabilities

# Highest probability first
top_startups = new_data.sort_values('success_probability', ascending=False)

# First ten rows of the ranking
top_ten_startups = top_startups.iloc[:10]

# Display the top ten startups
print(top_ten_startups)


In [None]:
# Names of the ten highest-ranked startups, in ranking order
top_ten_startup_names = list(top_ten_startups['name'])

# Numbered listing, starting at 1
for rank, startup_name in enumerate(top_ten_startup_names, 1):
    print(f"{rank}. {startup_name}")


In [None]:
# Predicted success probabilities of the ten highest-ranked startups, in ranking order
top_ten_success_probabilities = top_ten_startups['success_probability'].tolist()

# Numbered listing, starting at 1
for rank, probability in enumerate(top_ten_success_probabilities, 1):
    print(f"{rank}. {probability}")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of the ten startups with the highest predicted success probability.
# Names and probabilities are taken from `top_ten_startups` (built by the
# ranking cell above) instead of a pasted name list and one replicated value,
# so the chart shows each startup's own predicted probability.
startup_names = top_ten_startups['name'].tolist()
success_probabilities = top_ten_startups['success_probability'].tolist()

# Create a DataFrame for visualization
# NOTE: this rebinds `df` to the 10-row plotting frame.
df = pd.DataFrame({'Startup Name': startup_names, 'Success Probability': success_probabilities})

# Plotting
plt.figure(figsize=(12, 8))
sns.barplot(x='Success Probability', y='Startup Name', data=df, palette='viridis')
plt.title('Top 10 Startups Likely to Succeed')
plt.xlabel('Success Probability')
plt.ylabel('Startup Name')
plt.show()
