In [65]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor


In [66]:
# --- Global dataset: load, clean, split ---
# Location of the training CSV (replace with your file path).
file_path_global = r'C:\Users\tyler\OneDrive\Desktop\GitHub Repositories\DataScience2\Exit_Tickets\January31st\ETData.csv'

# Read the file and discard every row that contains at least one missing value.
data_global = pd.read_csv(file_path_global).dropna()

# No categorical encoding is applied here. If it becomes necessary:
# Example: data_global = pd.get_dummies(data_global, columns=['Category_Column'])

# Predictor columns; these labels are used verbatim to index the DataFrame.
features = [
    'Year',
    'Adult Mortality (probability of dying between 15 and 60 years per 1000 people)',
    'Infant Deaths (number of infant deaths per 1000 people)',
    'Alcohol (average number of litres consumed by a person)',
    'Hepatitis B (percent of population immunized)',
    'Measles (number of reported cases per 1000 people)',
    'BMI (average BMI for the population)',
    'Under-Five Deaths (number per 1000 people)',
    'Polio (percent of population immunized)',
    'Healthcare Spending (percentage of government\'s total budget)',
    'Diphtheria (percent of population immunized)',
    'HIV/AIDS (deaths per 1000 people)',
    'GDP (billion US Dollars)',
    'Population',
    'Schooling (average number of years completed)',
]

# Column the models learn to predict (replace with your target column name).
target = 'Life Expectancy (Years)'

# Separate the predictors from the response.
X_global = data_global[features]
y_global = data_global[target]

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train_global, X_val_global, y_train_global, y_val_global = train_test_split(
    X_global,
    y_global,
    test_size=0.2,
    random_state=42,
)


In [67]:
# Ordinary least-squares baseline, fitted on the training split.
# fit() returns the estimator itself, so construction and training can be chained.
linear_model = LinearRegression().fit(X_train_global, y_train_global)


In [68]:
# Polynomial regression: expand the training predictors to degree-2 terms,
# then fit a linear model on the expanded matrix.
# `poly` is kept as its own object because the same fitted transformer is
# applied to the USA data later in the notebook.
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train_global)

# fit() returns the estimator itself, so construction and training can be chained.
poly_model = LinearRegression().fit(X_train_poly, y_train_global)


In [69]:
# Train Decision Tree Regression Model
# A fixed random_state makes the fitted tree (and therefore its predictions)
# reproducible across kernel restarts: scikit-learn randomly permutes the
# features at every split, so an unseeded tree can differ from run to run.
# 42 is the same seed the train/validation split uses.
tree_model = DecisionTreeRegressor(random_state=42)
tree_model.fit(X_train_global, y_train_global)


In [70]:
# --- USA dataset: the row(s) to predict ---
# Location of the prediction CSV (replace with your file path).
file_path_usa = r'C:\Users\tyler\OneDrive\Desktop\GitHub Repositories\DataScience2\Exit_Tickets\January31st\PredictET.csv'
data_usa = pd.read_csv(file_path_usa)

# Pick the same predictor columns the models were trained on.
# NOTE: 'Population' is still raw text at this point; the following cell
# converts it to float and rebuilds X_usa before any prediction is made.
X_usa = data_usa.loc[:, features]


In [71]:
# Convert 'Population' from string to float (removing commas).
# The comma stripping only runs while the column is still text, which keeps
# this cell safe to re-run: once the column has been converted to float the
# `.str` accessor would raise AttributeError on a second execution.
population_usa = data_usa['Population']
if not pd.api.types.is_numeric_dtype(population_usa):
    # regex=False: the comma is a literal character, not a pattern.
    population_usa = population_usa.str.replace(',', '', regex=False)
data_usa['Population'] = population_usa.astype(float)

# Ensure the USA data matches the feature set of the global data
X_usa = data_usa[features]

# Preparing the USA data for Polynomial Regression: reuse the transformer
# fitted on the training data (transform only, no refit).
X_usa_poly = poly.transform(X_usa)

# Making predictions with each model
predictions_linear = linear_model.predict(X_usa)
predictions_poly = poly_model.predict(X_usa_poly)
predictions_tree = tree_model.predict(X_usa)

# Display predictions (first row of the USA file only)
print("Linear Regression Prediction:", predictions_linear[0])
print("Polynomial Regression Prediction:", predictions_poly[0])
print("Decision Tree Regression Prediction:", predictions_tree[0])


Linear Regression Prediction: 75.75884134356554
Polynomial Regression Prediction: 94.98949522371183
Decision Tree Regression Prediction: 77.1


In [72]:
# Score the linear model (R-squared) on the training split first, then on
# the held-out validation split.
r_squared_train, r_squared_val = (
    linear_model.score(X_split, y_split)
    for X_split, y_split in (
        (X_train_global, y_train_global),
        (X_val_global, y_val_global),
    )
)

# Report both scores rounded to two decimals, one line each.
print(
    "Linear Regression Model Performance:",
    f"R-squared on Training Set: {r_squared_train:.2f}",
    f"R-squared on Validation Set: {r_squared_val:.2f}",
    sep="\n",
)


Linear Regression Model Performance:
R-squared on Training Set: 0.83
R-squared on Validation Set: 0.80
