In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LogisticRegression
import seaborn as sns
from sklearn.metrics import accuracy_score
%matplotlib inline

In [3]:
df = pd.read_csv("./survey_results_public.csv")

In [4]:
df.head()

Unnamed: 0,Respondent,Professional,ProgramHobby,Country,University,EmploymentStatus,FormalEducation,MajorUndergrad,HomeRemote,CompanySize,...,StackOverflowMakeMoney,Gender,HighestEducationParents,Race,SurveyLong,QuestionsInteresting,QuestionsConfusing,InterestedAnswers,Salary,ExpectedSalary
0,1,Student,"Yes, both",United States,No,"Not employed, and not looking for work",Secondary school,,,,...,Strongly disagree,Male,High school,White or of European descent,Strongly disagree,Strongly agree,Disagree,Strongly agree,,
1,2,Student,"Yes, both",United Kingdom,"Yes, full-time",Employed part-time,Some college/university study without earning ...,Computer science or software engineering,"More than half, but not all, the time",20 to 99 employees,...,Strongly disagree,Male,A master's degree,White or of European descent,Somewhat agree,Somewhat agree,Disagree,Strongly agree,,37500.0
2,3,Professional developer,"Yes, both",United Kingdom,No,Employed full-time,Bachelor's degree,Computer science or software engineering,"Less than half the time, but at least one day ...","10,000 or more employees",...,Disagree,Male,A professional degree,White or of European descent,Somewhat agree,Agree,Disagree,Agree,113750.0,
3,4,Professional non-developer who sometimes write...,"Yes, both",United States,No,Employed full-time,Doctoral degree,A non-computer-focused engineering discipline,"Less than half the time, but at least one day ...","10,000 or more employees",...,Disagree,Male,A doctoral degree,White or of European descent,Agree,Agree,Somewhat agree,Strongly agree,,
4,5,Professional developer,"Yes, I program as a hobby",Switzerland,No,Employed full-time,Master's degree,Computer science or software engineering,Never,10 to 19 employees,...,,,,,,,,,,


In [5]:
df_2 = df[['HomeRemote', 'Country', 'YearsCodedJobPast', 'JobSatisfaction', 'Salary', 'DeveloperType']]
#df_3 = df_3.dropna(subset=['YearsCodedJobPast'], axis=0)
df_clean = df_2.fillna(method='ffill')
df_clean = df_clean.dropna(subset=['YearsCodedJobPast'], axis=0)

In [6]:
#df_clean = df_clean[df_clean['HomeRemote'] == 'All or almost all the time (I\'m full-time remote)']


In [7]:
df_clean = df_clean.replace('All or almost all the time (I\'m full-time remote)', 'Always remote')

In [8]:
df_clean.head()

Unnamed: 0,HomeRemote,Country,YearsCodedJobPast,JobSatisfaction,Salary,DeveloperType
12,A few days each month,United Kingdom,17 to 18 years,6.0,113750.0,Web developer
13,"Less than half the time, but at least one day ...",Germany,17 to 18 years,6.0,113750.0,Web developer
14,Always remote,United Kingdom,17 to 18 years,8.0,100000.0,Embedded applications/devices developer
15,Always remote,United States,17 to 18 years,8.0,100000.0,Desktop applications developer
16,Always remote,United Kingdom,17 to 18 years,8.0,100000.0,Web developer


In [9]:
df_clean.isnull().mean()

HomeRemote           0.0
Country              0.0
YearsCodedJobPast    0.0
JobSatisfaction      0.0
Salary               0.0
DeveloperType        0.0
dtype: float64

In [10]:

remote_by_country = df_clean.groupby('Country')['HomeRemote'].value_counts().unstack()['Always remote']
print(remote_by_country)


Country
Afghanistan             14.0
Aland Islands            1.0
Albania                 13.0
Algeria                  3.0
American Samoa           NaN
                        ... 
Virgin Islands (USA)     2.0
Yemen                    NaN
Zaire                    NaN
Zambia                   2.0
Zimbabwe                 4.0
Name: Always remote, Length: 201, dtype: float64


In [11]:
remote_by_experience = df_clean.groupby('YearsCodedJobPast')['HomeRemote'].value_counts().unstack()['Always remote']
print(remote_by_experience)

# Plot remote work preference by experience


YearsCodedJobPast
1 to 2 years        412
10 to 11 years      371
11 to 12 years      148
12 to 13 years       94
13 to 14 years      104
14 to 15 years      195
15 to 16 years      259
16 to 17 years      114
17 to 18 years      130
18 to 19 years       71
19 to 20 years       81
2 to 3 years        506
20 or more years    947
3 to 4 years        403
4 to 5 years        435
5 to 6 years        213
6 to 7 years        162
7 to 8 years        293
8 to 9 years        190
9 to 10 years       367
Less than a year    303
Name: Always remote, dtype: int64


In [12]:
df_clean['DeveloperType'] = df_clean['DeveloperType'].str.split(';')
df_exploded = df_clean.explode('DeveloperType')
remote_by_devtype = df_exploded.groupby('DeveloperType')['HomeRemote'].value_counts().unstack()['Always remote']
print(remote_by_devtype)

DeveloperType
 Data scientist                                            457.0
 Database administrator                                   1049.0
 Desktop applications developer                           1465.0
 DevOps specialist                                         680.0
 Developer with a statistics or mathematics background     562.0
 Embedded applications/devices developer                   383.0
 Graphic designer                                          409.0
 Graphics programming                                      345.0
 Machine learning specialist                               199.0
 Mobile developer                                         1245.0
 Other                                                     276.0
 Quality assurance engineer                                221.0
 Systems administrator                                     905.0
Data scientist                                              60.0
Database administrator                                      23.0
Desktop app

In [22]:
df_dummies = pd.get_dummies(df_exploded[['HomeRemote', 'Country', 'YearsCodedJobPast', 'JobSatisfaction', 'Salary', 'DeveloperType']])

X = df_dummies.drop(['HomeRemote_Always remote'], axis=1)
y = df_dummies['HomeRemote_Always remote']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')


Accuracy: 87.54%


In [21]:
coefs_df = pd.DataFrame()
coefs_df['est_int'] = X_train.columns
coefs_df['coefs'] = model.coef_.reshape(257,1)
coefs_df['abs_coefs'] = np.abs(model.coef_.reshape(257,1))
coefs_df = coefs_df.sort_values('abs_coefs', ascending=False)

coefs_df.head()

Unnamed: 0,est_int,coefs,abs_coefs
0,JobSatisfaction,-0.214882,0.214882
2,HomeRemote_A few days each month,-0.02503,0.02503
7,HomeRemote_Never,-0.022116,0.022116
256,DeveloperType_Web developer,-0.012193,0.012193
5,"HomeRemote_Less than half the time, but at lea...",-0.00716,0.00716


After all analysis, I discover that the reason why people prefer to work remotely is job satisfaction.