In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.svm import SVC 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
def test_model(model, data):
    X_train_scaled, X_test_scaled, y_train, y_test = data
    cls = model.fit(X_train_scaled, y_train)
    print(f'Model: {type(cls).__name__}')
    print(f'Train score: {cls.score(X_train_scaled, y_train)}')
    print(f'Test Score: {cls.score(X_test_scaled, y_test)}\n')

In [3]:
csv_file = "garments_worker_productivity.csv"

# Read the CSV file into a Pandas DataFrame
data_df = pd.read_csv(csv_file)

# Review the DataFrame
data_df.head()

Unnamed: 0,date,quarter,department,day,team,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers,actual_productivity
0,1/1/2015,Quarter1,sweing,Thursday,8,0.8,26.16,1108.0,7080,98,0.0,0,0,59.0,0.940725
1,1/1/2015,Quarter1,finishing,Thursday,1,0.75,3.94,,960,0,0.0,0,0,8.0,0.8865
2,1/1/2015,Quarter1,sweing,Thursday,11,0.8,11.41,968.0,3660,50,0.0,0,0,30.5,0.80057
3,1/1/2015,Quarter1,sweing,Thursday,12,0.8,11.41,968.0,3660,50,0.0,0,0,30.5,0.80057
4,1/1/2015,Quarter1,sweing,Thursday,6,0.8,25.9,1170.0,1920,50,0.0,0,0,56.0,0.800382


## Preprocess the data

In [5]:
mean_wip = data_df['wip'].mean()
mean_wip

1190.4659913169319

In [12]:
data_df.fillna({"wip": mean_wip}, inplace=True)
data_df

Unnamed: 0,date,quarter,department,day,team,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers,actual_productivity
0,1/1/2015,Quarter1,sweing,Thursday,8,0.80,26.16,1108.000000,7080,98,0.0,0,0,59.0,0.940725
1,1/1/2015,Quarter1,finishing,Thursday,1,0.75,3.94,1190.465991,960,0,0.0,0,0,8.0,0.886500
2,1/1/2015,Quarter1,sweing,Thursday,11,0.80,11.41,968.000000,3660,50,0.0,0,0,30.5,0.800570
3,1/1/2015,Quarter1,sweing,Thursday,12,0.80,11.41,968.000000,3660,50,0.0,0,0,30.5,0.800570
4,1/1/2015,Quarter1,sweing,Thursday,6,0.80,25.90,1170.000000,1920,50,0.0,0,0,56.0,0.800382
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1192,3/11/2015,Quarter2,finishing,Wednesday,10,0.75,2.90,1190.465991,960,0,0.0,0,0,8.0,0.628333
1193,3/11/2015,Quarter2,finishing,Wednesday,8,0.70,3.90,1190.465991,960,0,0.0,0,0,8.0,0.625625
1194,3/11/2015,Quarter2,finishing,Wednesday,7,0.65,3.90,1190.465991,960,0,0.0,0,0,8.0,0.625625
1195,3/11/2015,Quarter2,finishing,Wednesday,9,0.75,2.90,1190.465991,1800,0,0.0,0,0,15.0,0.505889


In [13]:
# Check the data types
data_df.dtypes

date                      object
quarter                   object
department                object
day                       object
team                       int64
targeted_productivity    float64
smv                      float64
wip                      float64
over_time                  int64
incentive                  int64
idle_time                float64
idle_men                   int64
no_of_style_change         int64
no_of_workers            float64
actual_productivity      float64
dtype: object

In [14]:
target_var = "targeted_productivity"
# Split the features and target data
# y = data_df[target_var].values.reshape(-1,1)
# X = data_df.drop(columns=target_var)

In [15]:
# replace with your categorical column names
categorical_cols = ['date','quarter','department','day']  

In [16]:
# Split data into training and testing datasets
X = data_df.drop(target_var, axis=1)  # replace with your target column
y = data_df[target_var]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# Initialize OneHotEncoder
ohe = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')

# Fit and transform
X_train_encoded = ohe.fit_transform(X_train[categorical_cols])
X_test_encoded = ohe.transform(X_test[categorical_cols])

# Create DataFrames
X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=ohe.get_feature_names_out(categorical_cols))
X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=ohe.get_feature_names_out(categorical_cols))

# Concatenate
X_train_final = pd.concat([X_train.drop(categorical_cols, axis=1).reset_index(drop=True), X_train_encoded_df], axis=1)
X_test_final = pd.concat([X_test.drop(categorical_cols, axis=1).reset_index(drop=True), X_test_encoded_df], axis=1)

## Check for Missing Values

In [18]:
# Find the percentage of null values in each column
X_train_final.isna().sum()/len(X_train_final)

team             0.0
smv              0.0
wip              0.0
over_time        0.0
incentive        0.0
                ... 
day_Saturday     0.0
day_Sunday       0.0
day_Thursday     0.0
day_Tuesday      0.0
day_Wednesday    0.0
Length: 79, dtype: float64

In [19]:
data = [X_train_final, X_test_final, y_train, y_test]

# Trying Different Models of Classification to Analyze the Data

In [21]:
def test_model(model, data):
    X_train_final, X_test_final, y_train, y_test = data
    cls = model.fit(X_train_final, y_train)
    print(f'Model: {type(cls).__name__}')
    print(f'Train score: {cls.score(X_train_final, y_train)}')
    print(f'Test Score: {cls.score(X_test_final, y_test)}\n')

In [22]:
test_model(SVC(kernel='linear'), data)
test_model(KNeighborsClassifier(n_neighbors=9), data)
test_model(DecisionTreeClassifier(), data)
test_model(RandomForestClassifier(n_estimators=128, random_state=1), data)

ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

In [None]:
# Try the following values for max_depth

max_depths = range(1, 10)
models = {'train_score': [], 'test_score': [], 'max_depth': []}

# Loop through each value in max_depths
for depth in max_depths:
    clf = RandomForestClassifier(max_depth = depth)
    clf.fit(X_train, y_train)

    train_pred = clf.predict(X_train)
    test_pred = clf.predict(X_test)

    train_score = balanced_accuracy_score(y_train, train_pred)
    test_score = balanced_accuracy_score(y_test, test_pred)

    models['train_score'].append(train_score)
    models['test_score'].append(test_score)
    models['max_depth'].append(depth)

# Create a dataframe from the models dictionary with max_depth as the index
models_df = pd.DataFrame(models).set_index('max_depth')

In [None]:
# Plot the results
models_df.plot()

In [None]:
# Pick the depth value from the plot above
depth = 
clf = RandomForestClassifier(max_depth=depth)
clf.fit(X_train, y_train) 

train_pred = clf.predict(X_train)
test_pred = clf.predict(X_test)

print('Random Forest Classifier')
print(f'Train score: {balanced_accuracy_score(y_train, train_pred)}')
print(f'Test Score: {balanced_accuracy_score(y_test, test_pred)}')

## Plots

In [None]:
# Seaborn Plots
sns.set()
plt.plot(x, y)
plt.legend('ABCDEF', ncol=2, loc='upper left');

In [None]:
# Histogram
data = np.random.multivariate_normal([0, 0], [[5, 2], [2, 2]], size=2000)
data = pd.DataFrame(data, columns=['x', 'y'])

for col in 'xy':
    plt.hist(data[col], normed=True, alpha=0.5)

In [None]:
for col in 'xy':
    sns.kdeplot(data[col], shade=True)

In [None]:
# Combine Historgram and KDE
sns.distplot(data['x'])
sns.distplot(data['y']);

In [None]:
# two-dimensional visualization of the data
sns.kdeplot(data);

In [None]:
# joint distribution and the marginal distributions
with sns.axes_style('white'):
    sns.jointplot("x", "y", data, kind='kde');

In [None]:
# hexagonally based histogram instead
with sns.axes_style('white'):
    sns.jointplot("x", "y", data, kind='hex')

# stuff below is extra old code

In [2]:
## Model and Fit to a Support Vector Machine

In [None]:
# Create the support vector machine classifier model with a 'linear' kernel
svm_model = SVC(kernel='linear')

# Fit the model to the training data
svm_model.fit(X_train_encoded, y_train)

In [None]:
# Validate the model by checking the model accuracy with model.score
print('Train Accuracy: %.3f' % svm_model.score(X_train_encoded, y_train))
print('Test Accuracy: %.3f' % svm_model.score(X_test_encoded, y_test))

In [None]:
## Model and Fit to a KNN Model

In [None]:
# Create the KNN model with 9 neighbors
knn_model = KNeighborsClassifier(n_neighbors=9)

# Fit the model to the training data
knn_model.fit(X_train_encoded, y_train)

In [None]:
# Validate the model by checking the model accuracy with model.score
print('Train Accuracy: %.3f' % knn_model.score(X_train_encoded, y_train))
print('Test Accuracy: %.3f' % knn_model.score(X_test_encoded, y_test))

In [None]:
## Model and Fit to a Decision Tree Classifier

In [None]:
# Create the decision tree classifier model
dt_model = DecisionTreeClassifier()

# Fit the model to the training data
dt_model.fit(X_train_encoded, y_train)

In [None]:
# Validate the model by checking the model accuracy with model.score
print('Train Accuracy: %.3f' % dt_model.score(X_train_encoded, y_train))
print('Test Accuracy: %.3f' % dt_model.score(X_test_encoded, y_test))

In [None]:
## Model and Fit to a Random Forest Classifier

In [None]:
# Create the random forest classifier model
# with n_estimators=128 and random_state=1
rf_model = RandomForestClassifier(n_estimators=128, random_state=1)

# Fit the model to the training data
rf_model.fit(X_train_encoded, y_train)

In [None]:
# Validate the model by checking the model accuracy with model.score
print('Train Accuracy: %.3f' % rf_model.score(X_train_encoded, y_train))
print('Test Accuracy: %.3f' % rf_model.score(X_test_encoded, y_test))

In [None]:
# Show all models' scores
print("Logistic Regression Model")
print('Train Accuracy: %.3f' % lr_model.score(X_train_encoded, y_train))
print('Test Accuracy: %.3f' % lr_model.score(X_test_encoded, y_test))
print("----------------------------------")
print("Support Vector Machine Model")
print('Train Accuracy: %.3f' % svm_model.score(X_train_encoded, y_train))
print('Test Accuracy: %.3f' % svm_model.score(X_test_encoded, y_test))
print("----------------------------------")
print("K Nearest Neighbor Model")
print('Train Accuracy: %.3f' % knn_model.score(X_train_encoded, y_train))
print('Test Accuracy: %.3f' % knn_model.score(X_test_encoded, y_test))
print("----------------------------------")
print("Decision Tree Model")
print('Train Accuracy: %.3f' % dt_model.score(X_train_encoded, y_train))
print('Test Accuracy: %.3f' % dt_model.score(X_test_encoded, y_test))
print("----------------------------------")
print("Random Forrest Model")
print('Train Accuracy: %.3f' % rf_model.score(X_train_encoded, y_train))
print('Test Accuracy: %.3f' % rf_model.score(X_test_encoded, y_test))