In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

from scipy.stats import zscore
import seaborn as sns

In [2]:
# Load the raw developer productivity records from the CSV in the working directory.
dataset = pd.read_csv(
    filepath_or_buffer="developer_productivity_dataset.csv",
)

In [3]:
# Print the column names, dtypes and non-null counts of the freshly loaded frame.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 13 columns):
Developer_ID                   200 non-null object
Week_Number                    200 non-null int64
Experience_Years               200 non-null float64
Role_Level                     200 non-null object
Programming_Language           200 non-null object
Remote_Work                    200 non-null bool
Hours_Worked                   200 non-null float64
Tasks_Completed                200 non-null int64
Code_Review_Score              200 non-null float64
Project_Familiarity_Percent    200 non-null int64
Collaboration_Score            200 non-null float64
Meetings_Attended              200 non-null int64
Productivity_Score             200 non-null float64
dtypes: bool(1), float64(5), int64(4), object(3)
memory usage: 19.1+ KB


In [4]:
# Count the missing values in every column (all zeros means nothing to impute).
missing_mask = dataset.isna()
missing_mask.sum()

Developer_ID                   0
Week_Number                    0
Experience_Years               0
Role_Level                     0
Programming_Language           0
Remote_Work                    0
Hours_Worked                   0
Tasks_Completed                0
Code_Review_Score              0
Project_Familiarity_Percent    0
Collaboration_Score            0
Meetings_Attended              0
Productivity_Score             0
dtype: int64

In [5]:
# Preview the raw frame; the bare last expression of the cell is rendered as a table.
dataset

Unnamed: 0,Developer_ID,Week_Number,Experience_Years,Role_Level,Programming_Language,Remote_Work,Hours_Worked,Tasks_Completed,Code_Review_Score,Project_Familiarity_Percent,Collaboration_Score,Meetings_Attended,Productivity_Score
0,D0039,15,4.2,Senior,C++,False,52.8,7,7.4,52,5.2,1,56.999528
1,D0029,32,4.8,Junior,C++,False,34.5,15,8.1,61,7.2,9,77.129808
2,D0015,32,5.2,Mid,Python,True,46.9,19,6.6,58,4.6,4,74.278398
3,D0043,24,12.8,Junior,JavaScript,True,26.7,10,7.4,73,6.7,3,66.969578
4,D0008,41,2.5,Senior,JavaScript,False,29.6,15,8.7,53,8.5,7,83.842350
...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,D0037,32,8.8,Junior,JavaScript,False,35.2,16,5.4,44,4.7,9,65.830453
196,D0033,47,11.2,Senior,Java,True,35.8,13,9.8,52,7.7,3,79.383883
197,D0042,22,11.6,Senior,C++,True,40.0,14,8.3,68,8.6,6,73.092362
198,D0044,23,12.4,Senior,Go,True,43.5,14,6.9,70,7.9,6,76.407934


In [6]:
# Convert Developer_ID from a string such as "D0039" to its integer part (39).
# - The pattern is a raw string: a plain '(\d+)' contains an invalid "\d" escape,
#   which newer Python versions warn about.
# - expand=False makes str.extract return a Series (not a one-column DataFrame),
#   so this is a plain column-for-column replacement.
# astype('int64') raises if any ID contains no digits, which surfaces bad rows early.
dataset['Developer_ID'] = (
    dataset['Developer_ID']
    .str.extract(r'(\d+)', expand=False)
    .astype('int64')
)

In [7]:
# 1. Encode the boolean Remote_Work flag as an integer: False -> 0, True -> 1.
# A direct cast keeps the mapping fixed no matter which values occur in the data.
# LabelEncoder is intended for target labels and derives its codes from the values
# it observes, so a column containing only True would have been encoded as 0.
dataset["Remote_Work"] = dataset["Remote_Work"].astype("int64")

In [8]:
# One-hot encode the two nominal columns in a single pass. The indicator columns are
# appended after the remaining columns in the order listed here (Role_Level_* first,
# then Programming_Language_*), matching the previous two-step result.
# dtype is pinned to uint8 so the indicators are 0/1 on every pandas version;
# pandas >= 2.0 defaults to bool, which would be exported to CSV as True/False.
dataset = pd.get_dummies(
    dataset,
    columns=['Role_Level', 'Programming_Language'],
    dtype='uint8',
)

In [9]:
# Inspect the frame after encoding: Developer_ID and Remote_Work are now numeric and
# Role_Level / Programming_Language have been replaced by one-hot indicator columns.
dataset

Unnamed: 0,Developer_ID,Week_Number,Experience_Years,Remote_Work,Hours_Worked,Tasks_Completed,Code_Review_Score,Project_Familiarity_Percent,Collaboration_Score,Meetings_Attended,Productivity_Score,Role_Level_Junior,Role_Level_Mid,Role_Level_Senior,Programming_Language_C++,Programming_Language_Go,Programming_Language_Java,Programming_Language_JavaScript,Programming_Language_Python
0,39,15,4.2,0,52.8,7,7.4,52,5.2,1,56.999528,0,0,1,1,0,0,0,0
1,29,32,4.8,0,34.5,15,8.1,61,7.2,9,77.129808,1,0,0,1,0,0,0,0
2,15,32,5.2,1,46.9,19,6.6,58,4.6,4,74.278398,0,1,0,0,0,0,0,1
3,43,24,12.8,1,26.7,10,7.4,73,6.7,3,66.969578,1,0,0,0,0,0,1,0
4,8,41,2.5,0,29.6,15,8.7,53,8.5,7,83.842350,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,37,32,8.8,0,35.2,16,5.4,44,4.7,9,65.830453,1,0,0,0,0,0,1,0
196,33,47,11.2,1,35.8,13,9.8,52,7.7,3,79.383883,0,0,1,0,0,1,0,0
197,42,22,11.6,1,40.0,14,8.3,68,8.6,6,73.092362,0,0,1,1,0,0,0,0
198,44,23,12.4,1,43.5,14,6.9,70,7.9,6,76.407934,0,0,1,0,1,0,0,0


In [10]:
# Collect the names of the columns stored as float64 or int64.
numeric_cols = (
    dataset
    .select_dtypes(include=['float64', 'int64'])
    .columns
)

In [11]:
# Show the selected column names. Note that the selection is taken from the full
# frame, so it also contains Developer_ID and the target Productivity_Score.
numeric_cols


Index(['Developer_ID', 'Week_Number', 'Experience_Years', 'Remote_Work',
       'Hours_Worked', 'Tasks_Completed', 'Code_Review_Score',
       'Project_Familiarity_Percent', 'Collaboration_Score',
       'Meetings_Attended', 'Productivity_Score'],
      dtype='object')

In [12]:
# Separate the prediction target from the feature matrix.
TARGET_COLUMN = "Productivity_Score"
dependent = dataset[TARGET_COLUMN]
independent = dataset.drop(columns=[TARGET_COLUMN])

In [13]:
# Hold out 30% of the rows for testing; random_state=0 makes the shuffle repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    independent,
    dependent,
    test_size=0.30,
    random_state=0,
)

In [14]:
# Preview the training features produced by the split (target column not included).
x_train

Unnamed: 0,Developer_ID,Week_Number,Experience_Years,Remote_Work,Hours_Worked,Tasks_Completed,Code_Review_Score,Project_Familiarity_Percent,Collaboration_Score,Meetings_Attended,Role_Level_Junior,Role_Level_Mid,Role_Level_Senior,Programming_Language_C++,Programming_Language_Go,Programming_Language_Java,Programming_Language_JavaScript,Programming_Language_Python
131,27,8,8.9,1,40.7,4,9.3,78,6.9,9,0,1,0,0,1,0,0,0
96,45,16,12.1,1,49.9,18,6.1,88,5.8,3,1,0,0,1,0,0,0,0
181,28,21,13.3,0,36.5,9,9.1,83,5.1,2,0,1,0,0,0,0,0,1
19,44,2,0.9,1,40.1,13,9.2,69,8.5,1,0,0,1,0,0,0,0,1
153,49,33,1.8,1,36.0,3,9.6,98,9.0,6,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,42,40,8.7,0,42.6,6,6.2,63,4.9,7,0,1,0,0,0,0,0,1
192,8,46,6.8,1,31.3,5,9.1,83,7.7,4,0,0,1,0,0,0,1,0
117,7,46,7.7,1,39.7,9,8.7,96,4.1,6,1,0,0,0,1,0,0,0
47,50,9,6.8,0,37.3,5,8.2,93,8.8,4,1,0,0,0,0,1,0,0


In [15]:
# Feature scaling is currently disabled; neither variant below is executed.
# NOTE(review): variant 1 would raise a KeyError if re-enabled as written, because
# numeric_cols contains 'Productivity_Score', which was dropped from x_train/x_test.
# Variant 1: scale only the columns listed in numeric_cols.
#sc = StandardScaler()
#train_scaler = sc.fit_transform(x_train[numeric_cols])
#test_scaler = sc.transform(x_test[numeric_cols])

# Variant 2: scale every feature column, including the one-hot indicators.
#sc = StandardScaler()
#train_scaler = sc.fit_transform(x_train)
#test_scaler = sc.transform(x_test)

In [16]:
# Re-display the encoded frame; the split cells only created new variables and did
# not reassign dataset.
dataset

Unnamed: 0,Developer_ID,Week_Number,Experience_Years,Remote_Work,Hours_Worked,Tasks_Completed,Code_Review_Score,Project_Familiarity_Percent,Collaboration_Score,Meetings_Attended,Productivity_Score,Role_Level_Junior,Role_Level_Mid,Role_Level_Senior,Programming_Language_C++,Programming_Language_Go,Programming_Language_Java,Programming_Language_JavaScript,Programming_Language_Python
0,39,15,4.2,0,52.8,7,7.4,52,5.2,1,56.999528,0,0,1,1,0,0,0,0
1,29,32,4.8,0,34.5,15,8.1,61,7.2,9,77.129808,1,0,0,1,0,0,0,0
2,15,32,5.2,1,46.9,19,6.6,58,4.6,4,74.278398,0,1,0,0,0,0,0,1
3,43,24,12.8,1,26.7,10,7.4,73,6.7,3,66.969578,1,0,0,0,0,0,1,0
4,8,41,2.5,0,29.6,15,8.7,53,8.5,7,83.842350,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,37,32,8.8,0,35.2,16,5.4,44,4.7,9,65.830453,1,0,0,0,0,0,1,0
196,33,47,11.2,1,35.8,13,9.8,52,7.7,3,79.383883,0,0,1,0,0,1,0,0
197,42,22,11.6,1,40.0,14,8.3,68,8.6,6,73.092362,0,0,1,1,0,0,0,0
198,44,23,12.4,1,43.5,14,6.9,70,7.9,6,76.407934,0,0,1,0,1,0,0,0


In [17]:
# Check the column dtypes and non-null counts of the encoded frame before export.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 19 columns):
Developer_ID                       200 non-null int64
Week_Number                        200 non-null int64
Experience_Years                   200 non-null float64
Remote_Work                        200 non-null int64
Hours_Worked                       200 non-null float64
Tasks_Completed                    200 non-null int64
Code_Review_Score                  200 non-null float64
Project_Familiarity_Percent        200 non-null int64
Collaboration_Score                200 non-null float64
Meetings_Attended                  200 non-null int64
Productivity_Score                 200 non-null float64
Role_Level_Junior                  200 non-null uint8
Role_Level_Mid                     200 non-null uint8
Role_Level_Senior                  200 non-null uint8
Programming_Language_C++           200 non-null uint8
Programming_Language_Go            200 non-null uint8
Programming_Language_

In [18]:
# Persist the encoded (unscaled) frame; index=False keeps the row index out of the file.
dataset.to_csv(
    path_or_buf="PreProcessedDeveloperProductivityDataset.csv",
    index=False,
)