In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
df = pd.read_csv("survey_results_public.csv")
df.head()

In [None]:
df = df[["Country", "EdLevel", "YearsCode", "Employment", "ConvertedCompYearly"]]
df = df.rename({"ConvertedCompYearly":"Salary"}, axis=1)
df.head()

In [None]:
df = df[df["Salary"].notnull()]
df.head()

In [None]:
df.info()

In [None]:
df = df.dropna()
df.isnull().sum()

In [None]:
df = df[df["Employment"] == "Employed, full-time"]
df = df.drop("Employment", axis=1)
df.info()

In [None]:
df['Country'].value_counts()

In [None]:
def shorten_categories(categories, cutoff):
    """Build a lookup that folds rare categories into ``'Other'``.

    Args:
        categories: Mapping-like object whose ``.items()`` yields
            ``(label, count)`` pairs (for example the result of
            ``Series.value_counts()`` or a plain dict).
        cutoff: Minimum count a label needs in order to keep its own name.

    Returns:
        dict mapping every label to itself when its count is at least
        ``cutoff``, and to the string ``'Other'`` otherwise.
    """
    return {
        label: (label if n_rows >= cutoff else 'Other')
        for label, n_rows in categories.items()
    }

# Countries with fewer than 400 respondents are relabelled 'Other'.
country_counts = df['Country'].value_counts()
country_map = shorten_categories(country_counts, 400)
df['Country'] = df['Country'].map(country_map)

# Show the distribution after the relabelling
print(df['Country'].value_counts())


In [None]:
# Salary distribution per country before any outlier filtering.
fig, ax = plt.subplots(figsize=(12, 7))
df.boxplot(column='Salary', by='Country', ax=ax)
fig.suptitle('Salary (US$) v Country')
ax.set_title('')  # blank out the per-axes title
ax.set_ylabel('Salary')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Keep salaries within [10000, 250000] (both ends inclusive) and drop the pooled 'Other' country.
salary_in_range = df["Salary"].between(10000, 250000)
named_country = df['Country'] != 'Other'
df = df[salary_in_range & named_country]

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(12,7))
df.boxplot('Salary', 'Country', ax=ax)
plt.suptitle('')
plt.ylabel('Salary')
plt.xticks(rotation=90)
plt.show()

In [None]:
df['YearsCode'].unique()

In [None]:
def clean_experience(x):
    """Convert a raw ``YearsCode`` answer into a number of years.

    Args:
        x: One answer from the ``YearsCode`` column: either one of the two
            textual range answers or a value that ``float()`` can parse.

    Returns:
        ``50`` for ``'More than 50 years'``, ``0.5`` for the "less than one
        year" answer, and ``float(x)`` for everything else.

    Raises:
        ValueError: If ``x`` is a string that is neither a recognised range
            answer nor parseable as a number.
        TypeError: If ``x`` is of a type ``float()`` does not accept.
    """
    if x == 'More than 50 years':
        return 50
    # Both spellings are accepted: the variant this notebook originally matched
    # ('... 1 years') and the singular form.
    # NOTE(review): the survey presumably uses 'Less than 1 year' — confirm
    # against df['YearsCode'].unique().
    if x in ('Less than 1 year', 'Less than 1 years'):
        return 0.5
    # Ordinary numeric answers. Without this line the function returned None
    # for them, which turned almost the whole column into missing values.
    return float(x)
# Replace each raw YearsCode answer with the value clean_experience returns for it.
df['YearsCode'] = df['YearsCode'].apply(clean_experience)

In [None]:
# List the distinct education answers present in the data.
df['EdLevel'].unique()

In [None]:
def clean_education(x):
    """Collapse a detailed ``EdLevel`` answer into one of four buckets.

    Args:
        x: Education answer as a string. Matching is by substring, so any
            parenthetical detail after the degree name is ignored.

    Returns:
        ``'Bachelor’s degree'``, ``'Master’s degree'``,
        ``'Professional degree'`` (also used for doctoral answers), or
        ``'Less than a Bachelors'`` for everything else.
    """
    if 'Bachelor’s degree' in x:
        return 'Bachelor’s degree'
    if 'Master’s degree' in x:
        return 'Master’s degree'
    # Doctoral answers are grouped with professional degrees. Without the
    # second check they fell through to 'Less than a Bachelors'.
    if 'Professional degree' in x or 'doctoral' in x:
        return 'Professional degree'
    return 'Less than a Bachelors'
# Collapse every education answer into the bucket chosen by clean_education.
df['EdLevel'] = df['EdLevel'].apply(clean_education)

In [None]:
# Check which education buckets remain after the mapping.
df['EdLevel'].unique()

In [None]:
from sklearn.preprocessing import LabelEncoder
le_education = LabelEncoder()
df['EdLevel'] = le_education.fit_transform(df['EdLevel'])
df['EdLevel'].unique()

In [None]:
le_country = LabelEncoder()
df['Country'] = le_country.fit_transform(df['Country'])
df['Country'].unique()

In [None]:
# Target is the salary column; the features are every other column.
y = df['Salary']
x = df.drop(columns='Salary')

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import pandas as pd

# Impute missing feature values with each column's mean (the statistic used is the mean).
column_means = x.mean()
x = x.fillna(column_means)

# Expand any non-numeric columns into indicator columns, dropping the first level of each.
x = pd.get_dummies(x, drop_first=True)

# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
split = train_test_split(x, y, test_size=0.2, random_state=0)
x_train, x_test, y_train, y_test = split

# Fit the linear model on the training rows only.
Linear_reg = LinearRegression().fit(x_train, y_train)

# Predict the held-out rows.
y_pred = Linear_reg.predict(x_test)

# Print the first ten predictions next to the first ten true salaries.
n_preview = 10
print("Predictions:", y_pred[:n_preview])
print("Actual values:", y_test[:n_preview].values)


In [None]:
# Smallest and largest salary among the training rows
# (presumably shown to give the error figures a sense of scale).
print("Target variable range:", y_train.min(), "to", y_train.max())


In [None]:
# Predict for every row of x (not only x_test); this overwrites the earlier y_pred.
y_pred = Linear_reg.predict(x)
y_pred

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
# Root-mean-squared error between y and the current y_pred (square root of the MSE).
error = np.sqrt(mean_squared_error(y, y_pred))

In [None]:
# Display the RMSE computed above.
error

In [None]:
from sklearn.tree import DecisionTreeRegressor
dec_tree_reg = DecisionTreeRegressor(random_state=0)
dec_tree_reg.fit(x, y.values)

In [None]:
y_pred = dec_tree_reg.predict(x)

In [None]:
error = np.sqrt(mean_squared_error(y, y_pred))
print("${:,.02f}".format(error))

In [None]:
from sklearn.ensemble import RandomForestRegressor
random_forest_reg = RandomForestRegressor(random_state=0)
random_forest_reg.fit(x, y.values)

In [None]:
y_pred = random_forest_reg.predict(x)

In [None]:
error = np.sqrt(mean_squared_error(y, y_pred))
print("${:,.02f}".format(error))

In [None]:
from sklearn.model_selection import GridSearchCV
max_depth = [None, 2, 4, 6, 8, 10, 12]
parameters =  {"max_depth": max_depth}
regressor = DecisionTreeRegressor(random_state=0)
gs = GridSearchCV(regressor, parameters, scoring='neg_mean_squared_error')
gs.fit(x, y.values)

In [None]:
regressor = gs.best_estimator_
regressor.fit(x, y.values)
y_pred = regressor.predict(x)
error = np.sqrt(mean_squared_error(y, y_pred))
print("${:,.02f}".format(error))

In [None]:
x

In [None]:
x = np.array([["United States", 'Master’s degree', 15]])
x

In [None]:
from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoders
le_country = LabelEncoder()
le_education = LabelEncoder()

# Fit the LabelEncoders on the appropriate columns
le_country.fit(x[:, 0])  # Assuming x[:, 0] contains country data
le_education.fit(x[:, 1])  # Assuming x[:, 1] contains education level data

# Now transform the data
x[:, 0] = le_country.transform(x[:, 0])
x[:, 1] = le_education.transform(x[:, 1])

# Convert the data type of x to float
x = x.astype(float)


In [None]:
# Predict with the current `regressor` for the encoded one-row sample held in x.
y_pred = regressor.predict(x)
y_pred

In [None]:
import pickle

# Bundle the regressor with both label encoders so that code loading the file can
# encode raw inputs before predicting.
data = {
    "model": regressor,
    "le_country": le_country,
    "le_education": le_education,
}
with open('saved_model.pkl', 'wb') as file:
    pickle.dump(data, file)

In [None]:
with open('saved_model.pkl', 'rb') as file:
    model = pickle.load(file)
regressor_loaded = model['model']
le_country = model["le_country"]
le_education = model["le_education"]

In [None]:
y_pred = regressor_loaded.predict(x)
y_pred

In [None]:
# Print the installed scikit-learn version.
# NOTE(review): presumably recorded because saved_model.pkl should be loaded with a
# compatible scikit-learn version — confirm.
import sklearn
print(sklearn.__version__)