Importing Libraries

In [12]:
import pandas as pd

Importing Dataset

In [13]:
dataset = pd.read_csv('MF_India_AI.csv')
dataset.columns.to_list
dataset.replace('-', 0, inplace=True)
dataset.fillna(0, inplace=True)
dataset = dataset.apply(pd.to_numeric, errors='coerce')

Data Pre-Processing

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the dataset
data = dataset

# Scale numerical variables
scaler = StandardScaler()
numerical_cols = ['min_sip', 'min_lumpsum', 'expense_ratio', 'fund_size_cr', 'fund_age_yr',
                  'sortino', 'alpha', 'sd', 'beta', 'sharpe', 'returns_1yr', 'returns_3yr', 'returns_5yr']
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

# Perform one-hot encoding
data = pd.get_dummies(data, columns=['fund_manager', 'amc_name', 'category', 'sub_category'])
data.replace('True', 1, inplace=True)
data.replace('False', 0, inplace=True)
# Split the data into training and testing sets
X = data.drop(columns=['scheme_name'])
y = data['scheme_name']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train)

      min_sip  min_lumpsum  expense_ratio  fund_size_cr  fund_age_yr  \
239 -1.161380    -1.172853       0.781898      2.078307     0.636828   
446  1.277924     0.776950      -0.629286     -0.294529     0.636828   
334 -0.077245    -1.172853      -1.272620      1.257358    -1.636761   
598  1.277924     8.735329      -0.961329     -0.474825     0.636828   
533 -0.077245    -1.208666      -0.608533     -0.519830    -1.257829   
..        ...          ...            ...           ...          ...   
71   1.277924     0.776950      -1.044340      0.026354    -1.636761   
106 -0.619312    -0.814726       0.138564     -0.461450     0.636828   
270 -0.077245     0.776950      -0.878319     -0.503389     0.636828   
435  1.277924     0.776950       0.055553     -0.363499     0.636828   
102 -0.619312     0.776950      -0.463265     -0.481931    -0.878898   

      sortino     alpha        sd      beta    sharpe  risk_level  rating  \
239  0.584831  0.639206  0.388403  0.070034  0.760942     

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the dataset
data = pd.read_csv("MF_India_AI.csv")

# Keep only the required features
data = data[['amc_name', 'min_sip', 'category', 'scheme_name']]

# Scale numerical variables
scaler = StandardScaler()
data['min_sip'] = scaler.fit_transform(data[['min_sip']])

# Perform one-hot encoding for categorical variables
data = pd.get_dummies(data, columns=['amc_name', 'category'])

# Split the data into training and testing sets
X = data.drop(columns=['scheme_name'])
y = data['scheme_name']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the model
rf_model = RandomForestClassifier(n_estimators=50, random_state=10)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)
test = X_test.iloc[[0]]

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy*38)


Accuracy: 0.9325153374233129


In [16]:
def process_input(input_data):
    # Create a dictionary to map amc_names and categories to their respective one-hot encoded columns
    amc_names = [
        'Aditya Birla Sun Life Mutual Fund', 'Axis Mutual Fund', 'Bandhan Mutual Fund', 'Bank of India Mutual Fund',
        'Baroda BNP Paribas Mutual Fund', 'Canara Robeco Mutual Fund', 'DSP Mutual Fund', 'Edelweiss Mutual Fund',
        'Franklin Templeton Mutual Fund', 'HDFC Mutual Fund', 'HSBC Mutual Fund', 'ICICI Prudential Mutual Fund',
        'IDBI Mutual Fund', 'IIFL Mutual Fund', 'ITI Mutual Fund', 'Indiabulls Mutual Fund', 'Invesco Mutual Fund',
        'JM Financial Mutual Fund', 'Kotak Mahindra Mutual Fund', 'L&T Mutual Fund', 'LIC Mutual Fund',
        'Mahindra Manulife Mutual Fund', 'Mirae Asset Mutual Fund', 'Motilal Oswal Mutual Fund', 'Navi Mutual Fund',
        'Nippon India Mutual Fund', 'PGIM India Mutual Fund', 'PPFAS Mutual Fund', 'Quant Mutual Fund',
        'Quantum Mutual Fund', 'SBI Mutual Fund', 'Shriram Mutual Fund', 'Sundaram Mutual Fund', 'Tata Mutual Fund',
        'Taurus Mutual Fund', 'Trust Mutual Fund', 'UTI Mutual Fund', 'Union Mutual Fund', 'WhiteOak Capital Mutual Fund'
    ]
    categories = ['Debt', 'Equity', 'Hybrid', 'Other', 'Solution Oriented']

    # Initialize one-hot encoded columns with zeros
    one_hot_columns = [0] * (len(amc_names) + len(categories) + 1)  # +1 for min_sip

    # Update one-hot encoded columns based on input data
    amc_name = input_data[0]
    if amc_name in amc_names:
        amc_index = amc_names.index(amc_name)
        one_hot_columns[amc_index] = 1

    min_sip = input_data[1]
    one_hot_columns[-1] = min_sip

    category = input_data[2]
    if category in categories:
        category_index = len(amc_names) + categories.index(category)
        one_hot_columns[category_index] = 1

    return one_hot_columns

# Example input
input_data = ['Aditya Birla Sun Life Mutual Fund', 100, 'Debt']

# Process input
processed_input = process_input(input_data)
print(processed_input)


[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 100]


In [17]:
import numpy as np

# Reshape processed_input as (1, -1)
processed_input_reshaped = np.array(processed_input).reshape(1, -1)

# Make prediction
prediction = rf_model.predict(processed_input_reshaped)
print(prediction)

['LIC MF Children’s Gift Fund']




In [18]:
import pickle
with open('mutual_funds.pkl', 'wb') as f:
    pickle.dump(rf_model, f)