In [54]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import time

from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import r2_score


In [55]:
def split_scalar(indep_X,dep_Y):
        X_train, X_test, y_train, y_test = train_test_split(indep_X, dep_Y, test_size = 0.2, random_state = 0)
        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)    
        return X_train, X_test, y_train, y_test

In [56]:
# Convert to tuple sets to compare row-wise equality
train_rows = set([tuple(row) for row in X_train])
test_rows = set([tuple(row) for row in X_test])

# Find intersection
overlap_count = len(train_rows.intersection(test_rows))
print("Overlapping rows between train and test:", overlap_count)

Overlapping rows between train and test: 41


In [57]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 1. Load data
df = pd.read_csv("Preprocessed_productivity_data.csv")
df = df.drop(columns=["Developer_ID"], errors='ignore')

# 2. Separate features and target
X = df.drop(columns=["Productivity_Score"])
y = df["Productivity_Score"]

# 3. Combine X and y for full row integrity during splitting
df_full = X.copy()
df_full["Productivity_Score"] = y

# 4. Shuffle and drop duplicates
df_full = df_full.drop_duplicates().sample(frac=1, random_state=42).reset_index(drop=True)

# 5. Custom split: 80% train, 20% test with no overlapping rows
train_size = int(len(df_full) * 0.8)
df_train = df_full.iloc[:train_size].reset_index(drop=True)
df_test = df_full.iloc[train_size:].reset_index(drop=True)

# 6. Split features and target again
X_train = df_train.drop(columns=["Productivity_Score"])
y_train = df_train["Productivity_Score"]
X_test = df_test.drop(columns=["Productivity_Score"])
y_test = df_test["Productivity_Score"]

# 7. Scale features
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# Check overlap again (should be zero)
train_rows = set([tuple(row) for row in X_train_scaled.values])
test_rows = set([tuple(row) for row in X_test_scaled.values])
print("Overlapping rows after fix:", len(train_rows.intersection(test_rows)))


Overlapping rows after fix: 0


In [58]:
def r2_prediction(regressor,test_rows,y_test):
    y_pred = regressor.predict(test_rows)
    r2=r2_score(y_test,y_pred)
    return r2

In [59]:
def selectkbest(indep_X,dep_Y,n):
        test = SelectKBest(score_func=chi2, k=n)
        fit1= test.fit(indep_X,dep_Y)
        # summarize scores       
        selectk_features = fit1.transform(indep_X)
        return selectk_features

In [60]:
def Linear(train_rows,y_train,test_rows):       
        regressor = LinearRegression()
        regressor.fit(train_rows, y_train)
        r2=r2_prediction(regressor,test_rows,y_test)
        return  r2 

In [61]:
def Svm_linear(train_rows,y_train,test_rows):       
        # Fitting svm_linear to the Training set
        
        regressor = SVR(kernel='linear')
        regressor.fit(train_rows, y_train)
        r2=r2_prediction(regressor,test_rows,y_test)
        return  r2 

In [62]:
def Svm_nonlinear(train_rows,y_train,test_rows):       
        # Fitting svm_nonlinear to the Training set
        
        regressor = SVR(kernel='rbf')
        regressor.fit(train_rows, y_train)
        r2=r2_prediction(regressor,test_rows,y_test)
        return  r2 

In [63]:
def Decision(train_rows,y_train,test_rows):       
        regressor = DecisionTreeRegressor(random_state=0)
        regressor.fit(train_rows, y_train)
        r2=r2_prediction(regressor,test_rows,y_test)
        return  r2 

In [64]:
def Random(train_rows,y_train,test_rows):       
        regressor = RandomForestRegressor(n_estimators=10,random_state=0)
        regressor.fit(train_rows, y_train)
        r2=r2_prediction(regressor,test_rows,y_test)
        return  r2 

In [65]:
def Selectk_regression(acclin,accsvml,accsvmnl,accdes,accrf): 
    
    dataframe=pd.DataFrame(index=['ChiSquare'],columns=['Linear','SVMl','SVMnl','Decision','Random'])

    for number,idex in enumerate(dataframe.index):
        
        dataframe['Linear'][idex]=acclin[number]       
        dataframe['SVMl'][idex]=accsvml[number]
        dataframe['SVMnl'][idex]=accsvmnl[number]
        dataframe['Decision'][idex]=accdes[number]
        dataframe['Random'][idex]=accrf[number]
    return dataframe.describe()

In [66]:
def rfeFeature(train_rows,dep_Y,n):
    
        rfelist=[]
        
        lin = LinearRegression()
        
        SVRl = SVR(kernel = 'linear')
        #SVRnl = SVR(kernel = 'rbf')
        dec = DecisionTreeRegressor(random_state = 0)
        
        rf = RandomForestRegressor(n_estimators = 10, random_state = 0)
        
        
        rfemodellist=[lin,SVRl,dec,rf] 
        for i in   rfemodellist:
            print(i)
            log_rfe = RFE(estimator=i, n_features_to_select=n)
            log_fit = log_rfe.fit(train_rows, dep_Y)
            log_rfe_feature=log_fit.transform(train_rows)
            rfelist.append(log_rfe_feature)
        return rfelist
    

In [67]:

def rfe_regression(acclog,accsvml,accdes,accrf): 
    
    rfedataframe=pd.DataFrame(index=['Linear','SVC','Random','DecisionTree'],columns=['Linear','SVMl',
                                                                                        'Decision','Random'])

    for number,idex in enumerate(rfedataframe.index):
        
        rfedataframe['Linear'][idex]=acclog[number]       
        rfedataframe['SVMl'][idex]=accsvml[number]
        rfedataframe['Decision'][idex]=accdes[number]
        rfedataframe['Random'][idex]=accrf[number]
    return rfedataframe

In [68]:
dataset1=pd.read_csv("Preprocessed_productivity_data.csv",index_col=None)

In [69]:
df2 = dataset1

In [70]:
df2= pd.get_dummies(df2,drop_first=True)

In [71]:
df2

Unnamed: 0,Developer_ID,Experience_Years,Avg_Tasks_Per_Week,Avg_Bugs_Per_Week,Commits_Per_Week,Meetings_Per_Week,Code_Review_Score,Project_Familiarity_Percent,Bug_Fix_Success_Rate,Overtime_Hours_Per_Week,Team_Size,Productivity_Score,Role_Level_Junior,Role_Level_Mid,Role_Level_Senior
0,1,7,15,4,38,4,9,88,75,4,7,100,0,0,1
1,2,4,3,1,28,6,6,54,75,0,11,100,1,0,0
2,3,8,18,3,26,4,8,50,90,9,14,100,0,1,0
3,4,5,9,1,9,6,10,62,84,6,7,100,0,0,1
4,5,7,7,4,37,7,7,70,72,9,8,100,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
370,396,1,13,3,16,4,10,56,99,8,11,100,1,0,0
371,397,2,5,0,17,3,8,84,78,5,5,100,1,0,0
372,398,6,3,2,29,7,5,77,71,4,14,100,1,0,0
373,399,9,8,2,23,4,10,63,67,8,5,100,0,1,0


In [72]:
indep_X = df2.drop('Productivity_Score', 1)
dep_Y = df2['Productivity_Score']

In [73]:
kbest=selectkbest(indep_X,dep_Y,5)

In [74]:
acclin=[]
accsvml=[]
accsvmnl=[]
accdes=[]
accrf=[]

X_train, X_test, y_train, y_test=split_scalar(kbest,dep_Y)  
for i in kbest:  
    
    r2_lin=Linear(X_train,y_train,X_test)
    acclin.append(r2_lin)
    
    r2_sl=Svm_linear(X_train,y_train,X_test)    
    accsvml.append(r2_sl)
    
    r2_nl=Svm_nonlinear(X_train,y_train,X_test)
    accsvmnl.append(r2_nl)
    
    r2_d=Decision(X_train,y_train,X_test)
    accdes.append(r2_d)
    
    r2_r=Random(X_train,y_train,X_test)
    accrf.append(r2_r)
    
    
result = Selectk_regression(acclin,accsvml,accsvmnl,accdes,accrf)

In [75]:
result

Unnamed: 0,Linear,SVMl,SVMnl,Decision,Random
count,1.0,1.0,1.0,1.0,1.0
unique,1.0,1.0,1.0,1.0,1.0
top,1.0,1.0,1.0,1.0,1.0
freq,1.0,1.0,1.0,1.0,1.0


In [76]:
rfelist = rfeFeature(indep_X,dep_Y,4)

LinearRegression()
SVR(kernel='linear')
DecisionTreeRegressor(random_state=0)
RandomForestRegressor(n_estimators=10, random_state=0)


In [77]:
Racclin=[]
accsvml=[]
accsvmnl=[]
accdes=[]
accrf=[]

for i in rfelist:   
    X_train, X_test, y_train, y_test=split_scalar(i,dep_Y)
    
    r2_lin=Linear(X_train,y_train,X_test)
    acclin.append(r2_lin)
    
    r2_sl=Svm_linear(X_train,y_train,X_test)    
    accsvml.append(r2_sl)
    
    #r2_NL=Svm_NL(X_train,y_train,X_test)
    #accsvmnl.append(r2_NL)
    
    r2_d=Decision(X_train,y_train,X_test)
    accdes.append(r2_d)
    
    r2_r=Random(X_train,y_train,X_test)
    accrf.append(r2_r)
    
    
result=rfe_regression(acclin,accsvml,accdes,accrf)

In [78]:
result

Unnamed: 0,Linear,SVMl,Decision,Random
Linear,1,1,1,1
SVC,1,1,1,1
Random,1,1,1,1
DecisionTree,1,1,1,1
