In [None]:
import gradio as gr

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import pandas as pd

import matplotlib as plt
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import train_test_split
from category_encoders import TargetEncoder
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from category_encoders import TargetEncoder

from sklearn.tree import DecisionTreeRegressor

# Load the unicorn dataset from the Colab filesystem. The functions below read
# it through the module-level name `data`.
data = pd.read_csv('/content/Unicorntablecsv.csv')
def suitable_investor(company, sector, location):
    """Recommend investors by finding the known startup most similar to a new one.

    The new startup is appended to the module-level ``data`` frame, every row is
    flattened into one text string (company, sector, location, investors), the
    strings are TF-IDF vectorised, and the existing rows are ranked by their
    similarity to the new row.

    Args:
        company: Name of the new startup.
        sector: Sector of the new startup.
        location: Location of the new startup.

    Returns:
        A pandas Series holding the 'Select Investors' value of the single most
        similar existing startup (empty if ``data`` has no rows).
    """
    features = ['Company', 'Sector', 'Location', 'Select Investors']

    # The user does not supply investors, so that field is an empty string and
    # contributes no tokens to the query row.
    new_entry = pd.DataFrame({
        'Company': [company],
        'Sector': [sector],
        'Location': [location],
        'Select Investors': [''],
    })
    data_with_new_entry = pd.concat([data, new_entry], ignore_index=True)

    # Combine the relevant columns into one text document per startup.
    data_with_new_entry['Features'] = data_with_new_entry[features].apply(
        lambda row: ' '.join(row.dropna().astype(str)), axis=1
    )

    vectorizer = TfidfVectorizer()
    features_matrix = vectorizer.fit_transform(data_with_new_entry['Features'])

    # Only the new entry's similarities are needed, so compare that single row
    # against the matrix instead of building the full N x N similarity matrix.
    # The slice (rather than an integer index) keeps the operand 2-D.
    target_index = len(data_with_new_entry) - 1
    similarities = linear_kernel(features_matrix[target_index:], features_matrix).ravel()

    # Rank from most to least similar and drop the new entry itself by index.
    # Relying on it being ranked first breaks when another row ties with it or
    # when the query vector is all zeros.
    ranked = similarities.argsort()[::-1]
    similar_startups_indices = ranked[ranked != target_index][:5]

    recommended_investors = data_with_new_entry.loc[similar_startups_indices, 'Select Investors']

    # Positional slice: keep only the best match, still as a Series.
    return recommended_investors.iloc[:1]

# NOTE(review): this re-reads the same CSV path that was already loaded into
# `data` earlier in this cell, so the rebinding looks redundant.
data = pd.read_csv('/content/Unicorntablecsv.csv')
def predict_valuation(sector, subsector, entryvaluation, entry, location, investors):
    """Predict a startup's valuation with a gradient-boosting regressor.

    The model is trained on a fixed 80% split of the module-level ``data``
    frame (categorical columns target-encoded) and then applied to the single
    row described by the arguments.

    Args:
        sector: Value for the 'Sector' feature.
        subsector: Value for the 'Sub-Sector' feature.
        entryvaluation: Entry valuation; converted with ``float()``.
        entry: Value for the 'Entry' feature.
        location: Value for the 'Location' feature.
        investors: Accepted for interface compatibility; not used by the model.

    Returns:
        The prediction formatted as ``"$<x> billion"``, rounded to one decimal.

    Raises:
        ValueError: If ``entryvaluation`` cannot be converted to a float.
    """
    features = data[['Sector', 'Sub-Sector', 'Entry Valuation^^ ($B)', 'Entry', 'Location']]
    target = data[['Valuation ($B)']].astype('float')
    input_data = pd.DataFrame({
        'Sector': [sector],
        'Sub-Sector': [subsector],
        'Entry Valuation^^ ($B)': [float(entryvaluation)],
        'Entry': [entry],
        'Location': [location]
    })

    # Keep the same seeded 80/20 split so the training set is reproducible.
    # The held-out part is not needed because only the user's row is predicted.
    X_train, _, Y_train, _ = train_test_split(features, target, test_size=0.2, random_state=42)

    target_encoder = TargetEncoder()
    X_train_encoded = target_encoder.fit_transform(X_train, Y_train)
    # Encode and predict the input row directly instead of appending it to the
    # test split through the private DataFrame._append API.
    input_encoded = target_encoder.transform(input_data)

    model = GradientBoostingRegressor(random_state=42)
    model.fit(X_train_encoded, Y_train.values.ravel())
    value = model.predict(input_encoded)[0]

    # The target column is 'Valuation ($B)', so the prediction is taken to be in
    # billions already. Dividing by 1,000,000 and labelling the result
    # "million" collapsed realistic outputs to "$0.0 million".
    rounded_number = round(value, 1)
    return "${} billion".format(rounded_number)

# Load the startup funding dataset used by ret_predicted_funding, which reads
# its 'Founded', 'Domain', 'City', 'Stage' and 'Amount' columns.
df = pd.read_csv('/content/startupdata.csv')
def ret_predicted_funding(year, domain, city, stage):
    """Predict the expected funding amount for a startup with a random forest.

    The model is trained on a fixed 80% split of the module-level ``df`` frame.
    Training targets that lie more than three per-domain standard deviations
    from the per-domain median are replaced by that median, categorical
    columns are target-encoded, and the features are ordered by their
    ``f_regression`` score before fitting.

    Args:
        year: Founding year; converted with ``float()`` for the 'Founded' feature.
        domain: Value for the 'Domain' feature.
        city: Value for the 'City' feature.
        stage: Value for the 'Stage' feature.

    Returns:
        The prediction divided by 1,000,000 and formatted as
        ``"$<x> million"``, rounded to one decimal.

    Raises:
        ValueError: If ``year`` cannot be converted to a float.
    """
    X = df[['Founded', 'Domain', 'City', 'Stage']]
    y = df['Amount'].astype('float')
    input_data = pd.DataFrame({
        'Founded': [float(year)],
        'Domain': [domain],
        'City': [city],
        'Stage': [stage]
    })

    # Keep the same seeded 80/20 split so the training set is reproducible.
    # The held-out part is not needed because only the user's row is predicted.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42)
    # Own copy: the outlier replacement below must not write into a slice.
    y_train = y_train.copy()

    # Replace outliers with the per-domain median. The statistics come from the
    # whole frame and are aligned to the training rows, so the mask and the
    # target share one index.
    domain_amounts = df.groupby('Domain')['Amount']
    grouped_median = domain_amounts.transform('median').loc[y_train.index]
    grouped_std = domain_amounts.transform('std').loc[y_train.index]
    outliers = (y_train - grouped_median).abs() > 3 * grouped_std
    y_train[outliers] = grouped_median[outliers]

    # Encode categorical variables using target encoding.
    categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
    target_encoder = TargetEncoder(cols=categorical_cols)
    X_train_encoded = target_encoder.fit_transform(X_train, y_train)
    # Encode the input row directly instead of appending it to the test split
    # through the private DataFrame._append API.
    input_encoded = target_encoder.transform(input_data)

    # Score the features and keep the top k (k equals the number of features
    # here, so this only fixes the column order fed to the model).
    k = 4
    selector = SelectKBest(score_func=f_regression, k=k)
    selector.fit(X_train_encoded, y_train)
    feature_scores = pd.DataFrame({'Feature': X_train_encoded.columns, 'Score': selector.scores_})
    print("Feature Scores:")
    print(feature_scores)

    selected_features = feature_scores.nlargest(k, 'Score')['Feature'].tolist()
    X_train_selected = X_train_encoded[selected_features]
    input_selected = input_encoded[selected_features]

    # Seeded so that identical inputs always yield the same prediction.
    model = RandomForestRegressor(
        max_depth=10, min_samples_split=4, n_estimators=100, random_state=42
    )
    model.fit(X_train_selected, y_train)

    value = model.predict(input_selected)[0]
    rounded_number = round(value / 1000000, 1)
    return "${} million".format(rounded_number)

# NOTE(review): this re-reads the same CSV path that was already loaded into
# `data` earlier in this cell, so the rebinding looks redundant.
data = pd.read_csv('/content/Unicorntablecsv.csv')
def get_time_to_unicorn(founded, sector, subsector, entryvaluation, location):
    """Predict the 'Entry' value of a startup with a decision-tree regressor.

    The model is trained on a fixed 80% split of the module-level ``data``
    frame (categorical columns target-encoded) and then applied to the single
    row described by the arguments.

    NOTE(review): the return value is the predicted 'Entry' column itself, not
    a duration. If 'Entry' is the year unicorn status was reached, callers
    wanting a time span would need to subtract ``founded`` -- confirm against
    the dataset.

    Args:
        founded: Value for the 'Founded' feature.
        sector: Value for the 'Sector' feature.
        subsector: Value for the 'Sub-Sector' feature.
        entryvaluation: Entry valuation; converted with ``float()``.
        location: Value for the 'Location' feature.

    Returns:
        The model's prediction for the input row (a NumPy scalar).

    Raises:
        ValueError: If ``entryvaluation`` cannot be converted to a float.
    """
    features = data[['Founded', 'Sector', 'Sub-Sector', 'Entry Valuation^^ ($B)', 'Location']]
    target = data[['Entry']]
    input_data = pd.DataFrame({
        'Founded': [founded],
        'Sector': [sector],
        'Sub-Sector': [subsector],
        'Entry Valuation^^ ($B)': [float(entryvaluation)],
        'Location': [location]
    })

    # Keep the same seeded 80/20 split so the training set is reproducible.
    # The held-out part is not needed because only the user's row is predicted.
    X_train, _, Y_train, _ = train_test_split(features, target, test_size=0.2, random_state=42)

    target_encoder = TargetEncoder()
    X_train_encoded = target_encoder.fit_transform(X_train, Y_train)
    # Encode and predict the input row directly instead of appending it to the
    # test split through the private DataFrame._append API.
    input_encoded = target_encoder.transform(input_data)

    # Seeded so that identical inputs always yield the same prediction.
    model = DecisionTreeRegressor(random_state=42)
    model.fit(X_train_encoded, Y_train)

    return model.predict(input_encoded)[0]

In [None]:
# Define the Gradio Interfaces for each function
# interface1: investor recommender -- three free-text inputs, one text output.
interface1 = gr.Interface(
    title="Predicting Most Suitable Investors",
    description="Enter the Company, Sector, and Location for the new entry and click 'Go'.",
    fn=suitable_investor,
    inputs=["text"] * 3,
    outputs="text",
)

# interface2: valuation predictor -- six free-text inputs, one text output.
interface2 = gr.Interface(
    title="Predicting Valuation of Startup Post Unicorn Status",
    description="Enter the sector, sub-sector, entry valuation, year of entry, location and investors for the new entry and click 'Go'.",
    fn=predict_valuation,
    inputs=["text"] * 6,
    outputs="text",
)

# interface3: funding predictor -- four free-text inputs, one text output.
# The first input is passed to ret_predicted_funding as `year` and used as the
# 'Founded' feature, so the description asks for the founding year.
interface3 = gr.Interface(
    fn=ret_predicted_funding,
    inputs=["text", "text", "text", "text"],
    outputs="text",
    title="Predicting Expected Amount of Funding.",
    description="Enter the founding year, domain, location and stage of funding for the new entry and click 'Go'.",
)

# interface4: 'Entry' predictor -- five free-text inputs, one text output.
# The first input is get_time_to_unicorn's `founded` argument; 'Entry' is what
# the model predicts, so the description must not ask the user for it.
interface4 = gr.Interface(
    fn=get_time_to_unicorn,
    inputs=["text", "text", "text", "text", "text"],
    outputs="text",
    title="Predicting Time to Unicorn Status",
    description="Enter the founding year, sector, sub-sector, entry-valuation and location for the new entry and click 'Go'.",
)

In [None]:
# Start the investor-recommender app (interface1) with default launch options.
interface1.launch()

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>



In [None]:
# Start the valuation-predictor app (interface2) with default launch options.
interface2.launch()

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>



In [None]:
# Start the funding-predictor app (interface3) with default launch options.
interface3.launch()

In [None]:
# Start the 'Entry'-predictor app (interface4) with default launch options.
interface4.launch()

In [None]:
# Install the package that provides TargetEncoder.
# NOTE(review): this cell sits after the cell that imports category_encoders,
# so on a fresh runtime it has to be run before that import cell.
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.1-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.1
