In [150]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std

import scipy.stats as stats

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Notebook-wide display settings: seaborn's dark grid for plots, plus pandas
# limits on how many rows / columns / characters of width a printed frame uses.
sns.set_style('darkgrid')
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 20)

In [151]:
# Load the raw survey export from the shared datasets folder (relative path).
raw_df = pd.read_csv('../shared_datasets/survey_results_public.csv')

### Get dataframe with columns important to us:
- **Respondent**: Randomized respondent ID number (not in order of survey response time)
- **ConvertedComp**: Salary converted to annual USD salaries using the exchange rate on 2019-02-01, assuming 12 working months and 50 working weeks.
- **YearsCode**: Including any education, how many years have you been coding?
- **Age**: What is your age (in years)? If you prefer not to answer, you may leave this question blank.
- **OrgSize**: Approximately how many people are employed by the company or organization you work for?
- **Gender**: Which of the following do you currently identify as? Please select all that apply. If you prefer not to answer, you may leave this question blank.
- **Ethnicity**: Which of the following do you identify as? Please check all that apply. If you prefer not to answer, you may leave this question blank.
- **LanguageWorkedWith**: Which of the following programming, scripting, and markup languages have you done extensive development work in over the past year, and which do you want to work in over the next year?  (If you both worked with the language and want to continue to do so, please check both boxes in that row.)
- **Country**: In which country do you currently reside?
- **WorkWeekHrs**: On average, how many hours per week do you work?
- **Extraversion**: Do you prefer online chat or IRL conversations?
- **Dependents**: Do you have any dependents (e.g., children, elders, or others) that you care for?

### Run .py files

In [152]:
# Execute the shared helper scripts inside this kernel's namespace.
# NOTE(review): later cells use CleanDataframe, sm_OLS and ols_recursion without
# defining them in this notebook — presumably these scripts provide them; confirm.
%run ../shared_datasets/clean_dataframe.py
%run ../shared_datasets/charlie_functions.py

In [153]:
# Wrap the raw survey frame in CleanDataframe and use its `cleaned_df`
# attribute as the working frame for the rest of the notebook.
dataframe_class = CleanDataframe(raw_df)
df = dataframe_class.cleaned_df

In [154]:
df

Unnamed: 0,Respondent,ConvertedComp,WorkWeekHrs,YearsCode,Age,CodeRevHrs,YearsCodePro,learned_code,Assembly,Bash/Shell/PowerShell,C,C#,C++,Clojure,Dart,Elixir,Erlang,F#,Go,HTML/CSS,Java,JavaScript,Kotlin,Objective-C,Other(s):,PHP,Python,R,Ruby,Rust,SQL,Scala,Swift,TypeScript,VBA,WebAssembly,LanguageWorkedWith_Total,Biracial,Black or of African descent,East Asian,Hispanic or Latino/Latina,Middle Eastern,Multiracial,"Native American, Pacific Islander, or Indigenous Australian",South Asian,White or of European descent,"OrgSize_1,000 to 4,999 employees",OrgSize_10 to 19 employees,"OrgSize_10,000 or more employees",OrgSize_100 to 499 employees,OrgSize_2-9 employees,OrgSize_20 to 99 employees,"OrgSize_5,000 to 9,999 employees",OrgSize_500 to 999 employees,"OrgSize_Just me - I am a freelancer, sole proprietor, etc.",Gender_Man,"Gender_Non-binary, genderqueer, or gender non-conforming",Gender_Woman,Country_Afghanistan,Country_Albania,Country_Algeria,Country_Andorra,Country_Argentina,Country_Armenia,Country_Australia,Country_Austria,Country_Azerbaijan,Country_Bahrain,Country_Bangladesh,Country_Belarus,Country_Belgium,Country_Bolivia,Country_Bosnia and Herzegovina,Country_Brazil,Country_Bulgaria,Country_Cambodia,Country_Cameroon,Country_Canada,Country_Chile,Country_China,Country_Colombia,"Country_Congo, Republic of the...",Country_Costa Rica,Country_Croatia,Country_Cuba,Country_Cyprus,Country_Czech Republic,Country_Côte d'Ivoire,Country_Denmark,Country_Dominican Republic,Country_Ecuador,Country_Egypt,Country_El Salvador,Country_Estonia,Country_Ethiopia,Country_Finland,Country_France,Country_Georgia,Country_Germany,Country_Ghana,Country_Greece,Country_Guatemala,Country_Haiti,Country_Honduras,Country_Hong Kong 
(S.A.R.),Country_Hungary,Country_Iceland,Country_India,Country_Indonesia,Country_Iran,Country_Iraq,Country_Ireland,Country_Israel,Country_Italy,Country_Jamaica,Country_Japan,Country_Jordan,Country_Kazakhstan,Country_Kenya,Country_Kuwait,Country_Kyrgyzstan,Country_Latvia,Country_Lebanon,Country_Libyan Arab Jamahiriya,Country_Lithuania,Country_Luxembourg,Country_Malawi,Country_Malaysia,Country_Maldives,Country_Malta,Country_Mauritius,Country_Mexico,Country_Mongolia,Country_Montenegro,Country_Morocco,Country_Mozambique,Country_Nepal,Country_Netherlands,Country_New Zealand,Country_Nicaragua,Country_Nigeria,Country_Norway,Country_Pakistan,Country_Panama,Country_Paraguay,Country_Peru,Country_Philippines,Country_Poland,Country_Portugal,Country_Qatar,Country_Republic of Korea,Country_Republic of Moldova,Country_Romania,Country_Russian Federation,Country_Saint Vincent and the Grenadines,Country_Saudi Arabia,Country_Senegal,Country_Serbia,Country_Singapore,Country_Slovakia,Country_Slovenia,Country_Somalia,Country_South Africa,Country_South Korea,Country_Spain,Country_Sri Lanka,Country_Sudan,Country_Swaziland,Country_Sweden,Country_Switzerland,Country_Taiwan,Country_Thailand,Country_The former Yugoslav Republic of Macedonia,Country_Trinidad and Tobago,Country_Tunisia,Country_Turkey,Country_Uganda,Country_Ukraine,Country_United Arab Emirates,Country_United Kingdom,Country_United Republic of Tanzania,Country_United States,Country_Uruguay,Country_Uzbekistan,"Country_Venezuela, Bolivarian Republic of...",Country_Viet Nam,Country_Yemen,Country_Zimbabwe,Extraversion_In real life (in person),Extraversion_Neither,Extraversion_Online,Dependents_No,Dependents_Yes,Trans_No,Trans_Yes,EdLevel_Associate degree,"EdLevel_Bachelor’s degree (BA, BS, B.Eng., etc.)",EdLevel_I never completed any formal education,"EdLevel_Master’s degree (MA, MS, M.Eng., MBA, etc.)","EdLevel_Other doctoral degree (Ph.D, Ed.D., etc.)",EdLevel_Primary/elementary school,"EdLevel_Professional degree (JD, MD, 
etc.)","EdLevel_Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)",EdLevel_Some college/university study without earning a degree,Student_No,"Student_Yes, full-time","Student_Yes, part-time",MainBranch_I am a developer by profession,"MainBranch_I am not primarily a developer, but I write code sometimes as part of my work",WorkLoc_Home,WorkLoc_Office,"WorkLoc_Other place, such as a coworking space or cafe","WorkPlan_There is a schedule and/or spec (made by me or by a colleague), and I follow it very closely","WorkPlan_There is a schedule and/or spec (made by me or by a colleague), and my work somewhat aligns",WorkPlan_There's no schedule or spec; I work on what seems most important or urgent,ImpSyn_A little above average,ImpSyn_A little below average,ImpSyn_Average,ImpSyn_Far above average,ImpSyn_Far below average,Employment_Employed full-time,Employment_Employed part-time,"Employment_Independent contractor, freelancer, or self-employed",OpSys_BSD,OpSys_Linux-based,OpSys_MacOS,OpSys_Windows,BetterLife_No,BetterLife_Yes,ResumeUpdate_I had a negative experience or interaction at work,"ResumeUpdate_I heard about a job opportunity (from a recruiter, online job posting, etc.)",ResumeUpdate_I was preparing for a job search,"ResumeUpdate_My job status changed (promotion, new job, etc.)",ResumeUpdate_Re-entry into the workforce,"ResumeUpdate_Something else changed (education, award, media, etc.)",LastHireDate_1-2 years ago,LastHireDate_3-4 years ago,LastHireDate_I've never had a job,LastHireDate_Less than a year ago,LastHireDate_More than 4 years ago,LastHireDate_NA - I am an independent contractor or self employed,JobSat_Neither satisfied nor dissatisfied,JobSat_Slightly dissatisfied,JobSat_Slightly satisfied,JobSat_Very dissatisfied,JobSat_Very satisfied,CareerSat_Neither satisfied nor dissatisfied,CareerSat_Slightly dissatisfied,CareerSat_Slightly satisfied,CareerSat_Very dissatisfied,CareerSat_Very satisfied,JobSeek_I am actively looking 
for a job,JobSeek_I am not interested in new job opportunities,"JobSeek_I’m not actively looking, but I am open to new opportunities",Hobbyist_No,Hobbyist_Yes,CompFreq_Monthly,CompFreq_Weekly,CompFreq_Yearly,SOVisitFreq_A few times per month or weekly,SOVisitFreq_A few times per week,SOVisitFreq_Daily or almost daily,SOVisitFreq_Less than once per month or monthly,SOVisitFreq_Multiple times per day,SOPartFreq_A few times per month or weekly,SOPartFreq_A few times per week,SOPartFreq_Daily or almost daily,SOPartFreq_I have never participated in Q&A on Stack Overflow,SOPartFreq_Less than once per month or monthly,SOPartFreq_Multiple times per day,FizzBuzz_No,FizzBuzz_Yes,Continent_Africa,Continent_Asia,Continent_Europe,Continent_North America,Continent_Oceania Australia,Continent_South America,Gender_Woman:Dependents_Yes,White or of European descent:Gender_Man,Biracial:Gender_Woman,Black or of African descent:Gender_Woman,East Asian:Gender_Woman,Hispanic or Latino/Latina:Gender_Woman,Middle Eastern:Gender_Woman,Multiracial:Gender_Woman,"Native American, Pacific Islander, or Indigenous Australian:Gender_Woman",South Asian:Gender_Woman,ImpSyn_Far above average:Gender_Man
13,14,57060.0,40.0,13,31.0,4.0,2,18.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,17,65277.0,45.0,5,29.0,5.0,2,24.0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,4,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18,19,31140.0,8.0,14,31.0,3.0,13,17.0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,6,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
22,23,69000.0,40.0,3,22.0,8.0,1,19.0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,1,0,1,0,0,0,0,0,6,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28,29,100000.0,40.0,4,32.0,2.0,2,28.0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,3,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88296,88848,13272.0,40.0,12,28.0,4.0,8,16.0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,5,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
88298,88850,120000.0,40.0,14,34.0,3.0,11,20.0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,4,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
88301,88853,110000.0,36.0,8,27.0,4.0,6,19.0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,5,0,0,0,1,0,1,1,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
88323,88876,180000.0,40.0,8,23.0,3.0,2,15.0,0,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,7,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0,0,1,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# df[df['ConvertedComp'] < 2000].sort_values('ConvertedComp', ascending=True)
# df_cleaned[df_cleaned['ConvertedComp'] > 490000].sort_values('ConvertedComp', ascending=True)

In [None]:
# list(df.columns)

In [None]:
# len(list(df.columns))

In [None]:
# corr = df.corr()

In [None]:
# fig, ax = plt.subplots(figsize=(10,10))

# ax = sns.heatmap(
#     corr,
#     vmin=-1, vmax=1, center=0,
#     cmap=sns.diverging_palette(20, 220, n=200),
#     square=True
# )
# ax.set_xticklabels(
#     ax.get_xticklabels(),
#     rotation=45,
#     horizontalalignment='right'
# );


In [None]:
# Baseline sanity check: regress ConvertedComp on the Respondent column alone
# (described above as a randomized ID). No random data is generated here; the
# seed is kept only so any later use of NumPy's global RNG stays repeatable.
np.random.seed(0)
x = df[['Respondent']]
y = df[['ConvertedComp']]

# Model initialization
regression_model = LinearRegression()
# Fit the data(train the model)
regression_model.fit(x, y)
# Predict
y_predicted = regression_model.predict(x)

# model evaluation
# mean_squared_error returns the MSE; take the square root so the value
# printed below really is the root mean squared error.
rmse = np.sqrt(mean_squared_error(y, y_predicted))
r2 = r2_score(y, y_predicted)

# printing values
print('Slope:' ,regression_model.coef_)
print('Intercept:', regression_model.intercept_)
print('Root mean squared error: ', rmse)
print('R2 score: ', r2)

# plotting values

# data points
# plt.scatter(x, y, s=10)
# plt.xlabel(x.columns[0])
# plt.ylabel(y.columns[0])

# predicted values
# plt.plot(x, y_predicted, color='r')
# plt.show()

In [None]:
# df[df['ConvertedComp'] == 0].sort_values('ConvertedComp', ascending=True)

In [None]:
# scratch work
# cleaned_df = df[['Respondent','ConvertedComp',
#                  'YearsCode','Age','WorkWeekHrs']].dropna()

# df_test = cleaned_df.groupby('Gender')['Respondent'].nunique()
# df_test

# cleaned_df[cleaned_df['ConvertedComp'] >= 1000000].sort_values('Age', ascending=True)
# df_converted_comp = pd.DataFrame(cleaned_df[['Respondent', 'YearsCode', 'Age']])
# troll = df_converted_comp[ (df_converted_comp['YearsCode'] < 10) ].index

# cleaned_df['ConvertedComp'].describe()
# test = cleaned_df.sort_values('Age', ascending=True)[:650]
# test
# cleaned_df[(cleaned_df['Age'] < 18)].sort_values('Age')
# cleaned_df[(cleaned_df['Age'] < 18)].sort_values('Age').describe()
# list_of_unique_countries = cleaned_df['Country'].unique()

# list_of_unique_continent = cleaned_df['Continent'].unique()
# foo = cleaned_df[ cleaned_df['Respondent'] == 1438 ]
# foo = cleaned_df.sort_values('WorkWeekHrs', ascending=False)
# foo
# foo.describe()
# df_test = cleaned_df.groupby('WorkWeekHrs')['Respondent'].nunique()
# df_test
# x = np.random.rand(100, 1)
# y = 2 + 3 * x + np.random.rand(100, 1)
# x = cleaned_df.drop(columns=['ConvertedComp'])
# x = cleaned_df[cleaned_df['calc_age'] > 18]
# y = cleaned_df[(cleaned_df['calc_age'] > 18) & (cleaned_df['ConvertedComp'])]

# Charlie Function Start

In [None]:
# Predictors are every column except the respondent ID and the outcome itself.
X = df.drop(columns=['Respondent','ConvertedComp'])
Y = df['ConvertedComp']

# NOTE(review): sm_OLS is not defined in this notebook — presumably it comes from
# charlie_functions.py; the meaning of its three return values should be
# confirmed against that file.
sm_performance,sm_variables,sm_outcomes = sm_OLS(X,Y)

In [None]:
# make a deep copy of our X values for the recursive function
X_recursive = X.copy(deep=True)

In [None]:
# recursive_performance,recursive_variables,recursive_outcomes = ols_recursion(X_recursive,Y)

In [None]:
# Stack the two performance tables (transposed) so they can be compared row by row.
# NOTE(review): `recursive_performance` is only assigned in the ols_recursion cell,
# which is currently commented out — on a fresh kernel this line raises NameError.
# NOTE(review): the index labels below are hard-coded predictor counts; confirm
# they still match the models being compared.
comparison = pd.concat((sm_performance.transpose(),recursive_performance.transpose()),ignore_index=True)
comparison.index = ['294 predictors','118 predictors']
# comparison

In [None]:
# X_recursive

In [None]:
# Ordered (old, new) replacements that turn a raw column label into a name that
# can be used as a term in a formula string. Order matters: separators are
# mapped to '_' first, then runs of underscores are shortened.
subs = [(' ', '_'), ('.', ''), ("’", ""), ("'", ""),
        (';', '_'), ('/', '_'), ('(', ''), (')', ''),
        (':', ''), ('-', '_'), (',', '_'),
        ('___', '_'), ('__', '_'),
        ('#', 'sharp'), ('+', 'plus'), ('&', 'and')
        ]

def col_formatting(col):
    """Return `col` with every replacement in `subs` applied, in order.

    Parameters
    ----------
    col : str
        Raw column label.

    Returns
    -------
    str
        The cleaned label, with any run of underscores collapsed to one.
    """
    for old, new in subs:
        col = col.replace(old, new)
    # One pass of the '___' / '__' rules leaves '__' behind when five or more
    # separators were adjacent, so keep collapsing until no double remains.
    while '__' in col:
        col = col.replace('__', '_')
    return col

X_recursive.columns = list(map(col_formatting, X_recursive.columns))

# list(X_recursive.columns)

In [None]:
x_cols = list(X_recursive.columns)

# Charlie Function End

### Dropping 'Respondent'

In [155]:
# errors='ignore' keeps this cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
df = df.drop(columns=['Respondent'], errors='ignore')

### Manipulate/Normalize column names so it can be used to run OLS Regression

In [157]:
# Ordered (old, new) replacements that turn a raw column label into a name that
# can be used as a term in a formula string. Order matters: separators are
# mapped to '_' first, then runs of underscores are shortened.
subs = [(' ', '_'), ('.', ''), ("’", ""), ("'", ""),
        (';', '_'), ('/', '_'), ('(', ''), (')', ''),
        (':', ''), ('-', '_'), (',', '_'),
        ('___', '_'), ('__', '_'),
        ('#', 'sharp'), ('+', 'plus'), ('&', 'and')
        ]

def col_formatting(col):
    """Return `col` with every replacement in `subs` applied, in order.

    Parameters
    ----------
    col : str
        Raw column label.

    Returns
    -------
    str
        The cleaned label, with any run of underscores collapsed to one.
    """
    for old, new in subs:
        col = col.replace(old, new)
    # One pass of the '___' / '__' rules leaves '__' behind when five or more
    # separators were adjacent, so keep collapsing until no double remains.
    while '__' in col:
        col = col.replace('__', '_')
    return col

df.columns = list(map(col_formatting, df.columns))

### Define the target (outcome) and predictors

In [158]:
# The response variable, and every remaining column of df as a candidate predictor.
outcome = 'ConvertedComp'
x_cols = list(df.columns)
# list.remove raises ValueError if the outcome column is missing from df.
x_cols.remove(outcome)

### Splitting dataframe into Train and Test

In [144]:
# Fix the seed so the train/test partition (and every coefficient and p-value
# derived from it) is the same on each Restart & Run All.
train, test = train_test_split(df, random_state=42)

In [145]:
print(len(train), len(test))
# train.head()

13984 4662


In [146]:
# test.head()

### Create function to fit ols model

In [147]:
def fit_ols_model(x_cols, outcome, data=None, alpha=0.05):
    """Fit an OLS model and split its predictors by statistical significance.

    Parameters
    ----------
    x_cols : list of str
        Predictor column names; joined with '+' to build the formula. An empty
        list fits an intercept-only model instead of producing an invalid formula.
    outcome : str
        Name of the response column.
    data : pandas.DataFrame, optional
        Frame to fit on. Defaults to the notebook-level `train` frame, which is
        what this function always used before the parameter existed.
    alpha : float, default 0.05
        Significance threshold applied to each coefficient's p-value.

    Returns
    -------
    summary
        The statsmodels summary object of the fitted model.
    x_cols_to_drop : list of str
        Predictors whose p-value is not below `alpha` (including undefined / NaN
        p-values). Never contains 'Intercept'.
    x_cols_to_keep : list of str
        Predictors whose p-value is below `alpha`. Never contains 'Intercept'.

    Also prints the number of fitted terms, the size of both lists and R-squared.
    """
    if data is None:
        data = train

    predictors = '+'.join(x_cols) if x_cols else '1'
    formula = outcome + '~' + predictors
    model = ols(formula=formula, data=data).fit()
    summary = model.summary()

    # Use the exact p-values from the results object. The summary table rounds
    # them to three decimals, so a term displayed as 0.050 previously matched
    # neither `< 0.05` nor `> 0.05` and vanished from both lists uncounted.
    p_values = model.pvalues.drop('Intercept', errors='ignore')
    significant = p_values < alpha

    x_cols_to_keep = p_values[significant].index.tolist()
    # Complement of the keep mask, so every predictor lands in exactly one list.
    x_cols_to_drop = p_values[~significant].index.tolist()

    print(f"p_table rows: {len(model.pvalues)} | x_cols_to_drop: {len(x_cols_to_drop)} | x_cols_to_keep: {len(x_cols_to_keep)}")
    print(f"{summary.tables[0].data[0][2].strip()} {summary.tables[0].data[0][3].strip()}")
    print("")
    return summary, x_cols_to_drop, x_cols_to_keep

### Call `fit_ols_model()` to view model

In [162]:
summary, x_drop, x_keep = fit_ols_model(x_cols, outcome)
# summary

p_table rows: 288 | x_cols_to_drop: 150 | x_cols_to_keep: 137
R-squared: 0.627



### Create `while loop` to remove uninfluential features.
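- It repeatedly refits the model on the features that were kept and removes any feature whose p-value is not below 0.05, stopping once no such feature remains.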

In [148]:
# Backward elimination: refit on the surviving predictors until a fit reports
# nothing left to drop. Also stop if no predictor survives, because an empty
# predictor list cannot be refit meaningfully.
while len(x_drop) > 0 and len(x_keep) > 0:
    summary, x_drop, x_keep = fit_ols_model(x_keep, outcome)
    # The intercept is not a predictor; do not let it keep the loop running.
    if 'Intercept' in x_drop:
        x_drop.remove('Intercept')

p_table rows: 288 | x_cols_to_drop: 150 | x_cols_to_keep: 137
R-squared: 0.627

p_table rows: 138 | x_cols_to_drop: 20 | x_cols_to_keep: 117
R-squared: 0.620

p_table rows: 118 | x_cols_to_drop: 6 | x_cols_to_keep: 110
R-squared: 0.619

p_table rows: 111 | x_cols_to_drop: 1 | x_cols_to_keep: 109
R-squared: 0.618

p_table rows: 110 | x_cols_to_drop: 1 | x_cols_to_keep: 108
R-squared: 0.618

p_table rows: 109 | x_cols_to_drop: 0 | x_cols_to_keep: 108
R-squared: 0.618



### Investigating Multicollinearity

In [164]:
# identify multicollinearity
X = df[x_keep]
vif = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
list(zip(x_cols, vif))

  vif = 1. / (1. - r_squared_i)


[('WorkWeekHrs', 1.1157456178504515),
 ('YearsCode', 7.573530264559225),
 ('Age', 1.0608170621445587),
 ('CodeRevHrs', 6.75895028563856),
 ('YearsCodePro', 1.449669848823533),
 ('learned_code', 1.4035755552235258),
 ('Assembly', 1.049148483970924),
 ('Bash_Shell_PowerShell', 1.0397723456956307),
 ('C', 1.170698223103257),
 ('Csharp', 1.5047222073398097),
 ('Cplusplus', 1.3693531766237639),
 ('Clojure', 1.0789367981041704),
 ('Dart', 1.3184846453459778),
 ('Elixir', 1.190914705586181),
 ('Erlang', 1.0965225439435053),
 ('Fsharp', 1.2321009292061764),
 ('Go', 1.1131355254575797),
 ('HTML_CSS', 2.7431792287746433),
 ('Java', 1.284612307383661),
 ('JavaScript', 1.0122772474853898),
 ('Kotlin', 2.3593646209280887),
 ('Objective_C', 1.4106718505054718),
 ('Others', 1.3312097169530879),
 ('PHP', 1.5333859173420021),
 ('Python', 1.3721473480071933),
 ('R', 1.6689517896469712),
 ('Ruby', 1.189806986180288),
 ('Rust', 1.2758504326006308),
 ('SQL', 1.952059876308152),
 ('Scala', 4.235409780056347

In [165]:
# Subset features based on multicollinearity
vif_scores = list(zip(x_keep, vif))
x_keep = [x for x,vif in vif_scores if vif < 5]
print(len(vif_scores), len(x_keep))

137 117


In [None]:
# Refit model with subset features
# Build the formula from x_keep (the predictors that survived the p-value and
# VIF filters); x_cols is the full, unfiltered column list.
predictors = '+'.join(x_keep)
formula = outcome + "~" + predictors
model = ols(formula=formula, data=train).fit()
model.summary()

In [None]:
# Check that the residuals are normally distributed
fig = sm.graphics.qqplot(model.resid, dist=stats.norm, line='45', fit=True)

In [None]:
# Check that the residuals are homoscedastic: residuals against fitted values,
# with a reference line at zero.
fitted = model.predict(train[x_cols])
plt.scatter(fitted, model.resid)
plt.plot(fitted, [0] * len(train))

In [None]:
# Check for outliers
df.ConvertedComp.hist()

In [None]:
# Remove extreme outliers
# Print the 80th-99th percentiles of the outcome to help choose a cutoff.
for i in range(80,100):
    q = i/100
    print("{} percentile: {}".format(q, df.ConvertedComp.quantile(q=q)))


# Rerun the model
# NOTE(review): this cell rebinds df and log-transforms the outcome, so running
# it twice applies the log twice — run it once per kernel session.
orig_tot = len(df)
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of the previous df.
df = df[df.ConvertedComp < 200000].copy() # Subsetting to remove extreme outliers
print('Percent removed:', (orig_tot -len(df))/orig_tot)
# NOTE(review): np.log yields -inf for 0 and NaN for negatives — confirm the
# cleaned data has no non-positive ConvertedComp values.
df['ConvertedComp'] = np.log(df['ConvertedComp']) # Applying a log transformation
# Seeded so the refit below is reproducible.
train, test = train_test_split(df, random_state=42)

# Refit model with subset features
# x_keep is the filtered predictor list; x_cols is the full, unfiltered one.
predictors = '+'.join(x_keep)
formula = outcome + "~" + predictors
model = ols(formula=formula, data=train).fit()
model.summary()

In [None]:
# Check normality assumption
fig = sm.graphics.qqplot(model.resid, dist=stats.norm, line='45', fit=True)

In [None]:
# Check the homoscedasticity assumption: residuals against fitted values,
# with a reference line at zero.
fitted = model.predict(train[x_cols])
plt.scatter(fitted, model.resid)
plt.plot(fitted, [0] * len(train))

In [None]:
# Print the 80th-99th percentiles of the outcome column, then plot its histogram.
comp = df.ConvertedComp
for i in range(80, 100):
    q = i / 100
    print("{} percentile: {}".format(q, comp.quantile(q=q)))

comp.hist()

In [None]:
# NOTE(review): if the log transform from the earlier cell has been applied, this
# cutoff of 11 is on the log scale (exp(11) is roughly 59,874) — confirm that
# this is the intended threshold.
df = df[df.ConvertedComp <= 11]
# Seeded so the refit below is reproducible.
train, test = train_test_split(df, random_state=42)

# Refit model with subset features
# x_keep is the filtered predictor list; x_cols is the full, unfiltered one.
predictors = '+'.join(x_keep)
formula = outcome + '~' + predictors
model = ols(formula=formula, data=train).fit()
model.summary()

In [None]:
# Check the Normality Assumption
fig = sm.graphics.qqplot(model.resid, dist=stats.norm, line='45', fit=True)

In [None]:
# Residuals against fitted values, with a reference line at zero.
fitted = model.predict(train[x_cols])
plt.scatter(fitted, model.resid)
plt.plot(fitted, [0] * len(train))

In [None]:
x_cols