In [53]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [29]:
import pandas as pd

try:
    df = pd.read_csv(r'ind_sch_updated.csv')
    print("--- Successfully loaded ind_sch_updated.csv ---")
except FileNotFoundError:
    print("Error: 'ind_sch_updated.csv' not found in Downloads folder. Please make sure the file exists at the specified path.")
    exit()


--- Successfully loaded ind_sch_updated.csv ---


In [30]:
print("\n--- Initial Data Inspection ---")
print("DataFrame Info (columns, data types, non-null counts):")
df.info()
print("\nFirst 5 Rows of the Raw Data:")
print(df.head())
print("\nChecking for Missing Values:")
print(df.isnull().sum())
print("-" * 40)


--- Initial Data Inspection ---
DataFrame Info (columns, data types, non-null counts):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   fiscal_year  40 non-null     object
 1   school_type  40 non-null     object
 2   value        40 non-null     int64 
 3   unit         40 non-null     object
 4   note         40 non-null     object
dtypes: int64(1), object(4)
memory usage: 1.7+ KB

First 5 Rows of the Raw Data:
  fiscal_year       school_type      value                      unit  \
0     2014-15        Government  139000000  value in absolute number   
1     2014-15  Government Aided   31000000  value in absolute number   
2     2014-15   Private Unaided   71000000  value in absolute number   
3     2014-15            Others    5000000  value in absolute number   
4     2015-16        Government  137000000  value in absolute number   

         

In [31]:
df_processed = df.drop(columns=['unit', 'note'])
print("\n--- Dropped 'unit' and 'note' columns ---")



--- Dropped 'unit' and 'note' columns ---


In [32]:
print("\n--- Encoding Categorical Features ---")


--- Encoding Categorical Features ---


In [33]:
categorical_features = ['fiscal_year', 'school_type']

In [34]:
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_data = encoder.fit_transform(df_processed[categorical_features])

In [35]:
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(categorical_features))

In [36]:
final_df = pd.concat([df_processed[['value']].reset_index(drop=True), encoded_df], axis=1)

print("\n--- Processed DataFrame Head ---")
print("This is how the data looks after cleaning and one-hot encoding:")
print(final_df.head())
print("-" * 40)


--- Processed DataFrame Head ---
This is how the data looks after cleaning and one-hot encoding:
       value  fiscal_year_2014-15  fiscal_year_2015-16  fiscal_year_2016-17  \
0  139000000                  1.0                  0.0                  0.0   
1   31000000                  1.0                  0.0                  0.0   
2   71000000                  1.0                  0.0                  0.0   
3    5000000                  1.0                  0.0                  0.0   
4  137000000                  0.0                  1.0                  0.0   

   fiscal_year_2017-18  fiscal_year_2018-19  fiscal_year_2019-20  \
0                  0.0                  0.0                  0.0   
1                  0.0                  0.0                  0.0   
2                  0.0                  0.0                  0.0   
3                  0.0                  0.0                  0.0   
4                  0.0                  0.0                  0.0   

   fiscal_year_202

In [37]:
print("\n--- Splitting Data into Training and Testing Sets ---")

X = final_df.drop('value', axis=1)

y = final_df['value']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Shape of X_train (features for training): {X_train.shape}")
print(f"Shape of y_train (target for training): {y_train.shape}")
print(f"Shape of X_test (features for testing): {X_test.shape}")
print(f"Shape of y_test (target for testing): {y_test.shape}")
print("\n--- Preprocessing Complete ---")



--- Splitting Data into Training and Testing Sets ---
Shape of X_train (features for training): (32, 14)
Shape of y_train (target for training): (32,)
Shape of X_test (features for testing): (8, 14)
Shape of y_test (target for testing): (8,)

--- Preprocessing Complete ---


In [38]:
final_df.to_csv('ind_sch_newprocessed.csv', index=False)
print("\nProcessed data has been saved to 'ind_sch_processed.csv'")


Processed data has been saved to 'ind_sch_processed.csv'


In [39]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

In [40]:
try:
    df = pd.read_csv(r'ind_sch_updated.csv')  # Use raw string with r''
    print("--- Successfully loaded your file: ind_sch_updated.csv ---")
except FileNotFoundError:
    print("Error: 'ind_sch_updated.csv' not found in the Downloads folder. Please make sure the file exists at the specified path.")
    exit()

--- Successfully loaded your file: ind_sch_updated.csv ---


In [41]:
df_processed = df.drop(columns=['unit', 'note'])
categorical_features = ['fiscal_year', 'school_type']
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_data = encoder.fit_transform(df_processed[categorical_features])
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(categorical_features))
final_df = pd.concat([df_processed[['value']].reset_index(drop=True), encoded_df], axis=1)



In [42]:
X = final_df.drop('value', axis=1)
y = final_df['value']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train)
print("--- Preprocessing complete. ---")

    fiscal_year_2014-15  fiscal_year_2015-16  fiscal_year_2016-17  \
39                  0.0                  0.0                  0.0   
6                   0.0                  1.0                  0.0   
25                  0.0                  0.0                  0.0   
9                   0.0                  0.0                  1.0   
13                  0.0                  0.0                  0.0   
31                  0.0                  0.0                  0.0   
34                  0.0                  0.0                  0.0   
8                   0.0                  0.0                  1.0   
17                  0.0                  0.0                  0.0   
24                  0.0                  0.0                  0.0   
0                   1.0                  0.0                  0.0   
33                  0.0                  0.0                  0.0   
5                   0.0                  1.0                  0.0   
11                  0.0           

In [43]:
model = LinearRegression()
print("\n--- Training the Linear Regression model... ---")
model.fit(X_train, y_train)
print("--- Model training complete. ---")


--- Training the Linear Regression model... ---
--- Model training complete. ---


In [44]:
print("\n--- Making predictions on the test data... ---")
y_pred = model.predict(X_test)


--- Making predictions on the test data... ---


In [45]:
print("\n--- Evaluating Model Performance ---")
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print(f"Root Mean Squared Error (RMSE): {rmse:,.2f}")
print(f"R-squared (R²): {r2:.4f}")


--- Evaluating Model Performance ---
Root Mean Squared Error (RMSE): 2,826,477.79
R-squared (R²): 0.9976


In [46]:
print("\n--- Sample of Predictions vs Actual Values ---")
comparison_df = pd.DataFrame({'Actual Value': y_test, 'Predicted Value': y_pred})
print(comparison_df.head())
print("-" * 40)


--- Sample of Predictions vs Actual Values ---
    Actual Value  Predicted Value
19       4500000     4.252360e+06
16     131000000     1.329627e+08
15       4600000     3.502360e+06
26      81000000     7.966575e+07
4      137000000     1.313119e+08
----------------------------------------


In [47]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

try:
    df = pd.read_csv(r'ind_sch_updated.csv')
except FileNotFoundError:
    print("Error: 'ind_sch_updated.csv' not found.")
    exit()

In [48]:
df_processed = df.drop(columns=['unit', 'note'])
categorical_features = ['fiscal_year', 'school_type']
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_data = encoder.fit_transform(df_processed[categorical_features])
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(categorical_features))
final_df = pd.concat([df_processed[['value']].reset_index(drop=True), encoded_df], axis=1)

X = final_df.drop('value', axis=1)
y = final_df['value']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [49]:
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_y_pred = lr_model.predict(X_test)
lr_r2 = r2_score(y_test, lr_y_pred)
lr_rmse = np.sqrt(mean_squared_error(y_test, lr_y_pred))


# --- 3. Train and Evaluate an Advanced Model: Random Forest ---
print("--- Training the RandomForestRegressor model... ---")

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
print("--- Model training complete. ---")


--- Training the RandomForestRegressor model... ---
--- Model training complete. ---


In [50]:
rf_y_pred = rf_model.predict(X_test)

In [51]:
rf_r2 = r2_score(y_test, rf_y_pred)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_y_pred))


In [52]:
print("\n--- Model Comparison ---")
print(f"Linear Regression -> R²: {lr_r2:.4f}, RMSE: {lr_rmse:,.2f}")
print(f"Random Forest     -> R²: {rf_r2:.4f}, RMSE: {rf_rmse:,.2f}")
print("-" * 40)



--- Model Comparison ---
Linear Regression -> R²: 0.9976, RMSE: 2,826,477.79
Random Forest     -> R²: 0.9986, RMSE: 2,189,314.43
----------------------------------------
