In [None]:
#1.Debug the given code

import pandas as pd

data = {'Feature1': ['10', '20', 'Thirty'],  # 'Thirty' is not a valid number
        'Feature2': [5.5, 6.7, 8.9]}

df = pd.DataFrame(data)
df['Feature1'] = df['Feature1'].astype(int)  # Error: Cannot convert 'Thirty' to int

print(df)


In [1]:
#1. Correct Code
import pandas as pd

data = {'Feature1': ['10', '20', 'Thirty'],  # Change 'Thirty' into 30
        'Feature2': [5.5, 6.7, 8.9]}

df = pd.DataFrame(data)
#df['Feature1'] = df['Feature1'].astype(int) # Error: Cannot convert 'Thirty' to int

#Using the to_numeric function to convert non-numeric data into numerical data
df['Feature1'] = pd.to_numeric(df['Feature1'], errors='coerce') # Shows the invalid values as NaN(Not a Number)
print(df)

   Feature1  Feature2
0      10.0       5.5
1      20.0       6.7
2       NaN       8.9


In [None]:
#2.Debug the Code

import pandas as pd

data = {'A': [1, 2, None],  # Missing value
        'B': [4, None, 6]}  # Missing value

df = pd.DataFrame(data)
mean_value = df.mean()
df.fillna(mean_value)  # Error: fillna() does not modify in place

print(df)


In [2]:
#2.Correct Code

import pandas as pd

data = {'A': [1, 2, None],  
        'B': [4, None, 6]}

df = pd.DataFrame(data)
mean_value = df.mean()
df.fillna(mean_value)  # Error: fillna() does not modify in place
df.fillna(df.mean(), inplace=True)  # Use inplace=True to modify the DataFrame
print(df)

     A    B
0  1.0  4.0
1  2.0  5.0
2  1.5  6.0


In [None]:
#3. Debug the Code
from sklearn.linear_model import LinearRegression
import numpy as np

X = np.array([1, 2, 3, 4, 5])  # Error: X should be 2D
y = np.array([2, 4, 6, 8, 10])

model = LinearRegression()
model.fit(X, y)

In [6]:
#3. Correct Code
from sklearn.linear_model import LinearRegression
import numpy as np

X = np.array([1, 2, 3, 4, 5]).reshape(-1, 1)  # Error: X should be 2D, Changing 1D into 2d using .reshape(-1,1)
y = np.array([2, 4, 6, 8, 10])

model = LinearRegression()
model.fit(X, y)

print(model.predict([[6]]))

[12.]


In [None]:
#4. Debug the Code
from sklearn.preprocessing import StandardScaler
import numpy as np

data = np.array([10, 20, 30, 40, 50])  # Error: Should be 2D

scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

print(scaled_data)

In [7]:
# 4. Correct Code
from sklearn.preprocessing import StandardScaler
import numpy as np

data = np.array([10, 20, 30, 40, 50]).reshape(-1,1)  # Error: Should be 2D

scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

print(scaled_data)

[[-1.41421356]
 [-0.70710678]
 [ 0.        ]
 [ 0.70710678]
 [ 1.41421356]]


In [None]:
#5. Debug the Code
from sklearn.linear_model import LogisticRegression

X = [[1, 2], [3, 4], [5, 6]]
y = ['yes', 'no', 'yes']  # Error: Labels should be numeric

model = LogisticRegression()
model.fit(X, y)

In [8]:
#5. Correct Code

from sklearn.linear_model import LogisticRegression

X = [[1, 2], [3, 4], [5, 6]]
y = [1, 0, 1]  # Convert labels to numeric (Labels must be 0 and 1 for classification.)

model = LogisticRegression()
model.fit(X, y)

print(model.predict([[4, 5]]))

[1]


In [None]:
#6. Debug the Code
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

df = pd.DataFrame({'Category': ['A', 'B', 'C', 'A']})

encoder = OneHotEncoder
encoded = encoder.fit_transform(df['Category'])  # Error: Data should be reshaped
print(encoded)

In [9]:
#6. Correct Code
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

df = pd.DataFrame({'Category': ['A', 'B', 'C', 'A']})

encoder = OneHotEncoder()
encoded = encoder.fit_transform(df[['Category']])  # Error: Data should be reshaped
print(encoded)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 4 stored elements and shape (4, 3)>
  Coords	Values
  (0, 0)	1.0
  (1, 1)	1.0
  (2, 2)	1.0
  (3, 0)	1.0


In [None]:
#6. Debug the Code
from sklearn.model_selection import train_test_split

X = [[1, 2], [3, 4], [5, 6], [7, 8]]
y = [0, 1, 0] #4th sample is missing

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [10]:
#6. Correct Code
from sklearn.model_selection import train_test_split

X = [[1, 2], [3, 4], [5, 6], [7, 8]]
y = [0, 1, 0, 2] #Add the 4th Sample

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print("X_train:", X_train)
print("y_train:", y_train)

X_train: [[7, 8], [5, 6], [3, 4]]
y_train: [2, 0, 1]


In [None]:
#7.from sklearn.linear_model import LogisticRegression

X_train = [[1, 2], [3, 4], [5, 6]]
y_train = ["yes", "no", "yes"]  # Error: LogisticRegression expects numerical labels

model = LogisticRegression()
model.fit(X_train, y_train)

In [11]:
#7. Correct Code
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

X_train = [[1, 2], [3, 4], [5, 6]]
y_train = ["yes", "no", "yes"]

#Logistic Regression requires numeric labels, using LabelEncoder()
encoder = LabelEncoder() 
y_train_encoded = encoder.fit_transform(y_train)  # Convert labels to numeric

model = LogisticRegression()
model.fit(X_train, y_train_encoded)
answer = model.predict([[2,3]])
result = int(answer[0])
print(result)
#print(model.predict([[2, 3]]))

1


In [None]:
#8. Debug the code
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# Creating a dataset with missing values
X_train = np.array([[1, 2], [3, np.nan], [5, 6]])
y_train = np.array([10, 20, 30])

model = LinearRegression()
model.fit(X_train, y_train)  


In [12]:
#8. Correct Code
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer

# Given dataset is with missing values
X_train = np.array([[1, 2], [3, np.nan], [5, 6]])
y_train = np.array([10, 20, 30])

# Missing values by replacing with NaN with the column mean using SimpleImputer
imputer = SimpleImputer(strategy="mean")
X_train_imputed = imputer.fit_transform(X_train)

# Train the model
model = LinearRegression()
model.fit(X_train_imputed, y_train)

# Make a prediction
prediction = model.predict([[3, 4]])
answer = int(prediction[0])
print("Predicted Output:", answer)


Predicted Output: 20


In [None]:
#9.
from sklearn.tree import DecisionTreeClassifier

X_train = [[1, 2], [3, 4], [5, 6]]
y_train = ["spam", "ham", "spam"]  # Error: String labels not allowed

model = DecisionTreeClassifier()
model.fit(X_train, y_train)

In [13]:
#9.Correct Code
from sklearn.tree import DecisionTreeClassifier

X_train = [[1, 2], [3, 4], [5, 6]]
y_train = ["spam", "ham", "spam"]  # Error: String labels not allowed

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
y_train_encoded = encoder.fit_transform(y_train)  # Convert to numbers

model.fit(X_train, y_train_encoded)
print(model.predict([[2, 3]]))  # Output: [0] or [1]


model = DecisionTreeClassifier()
model.fit(X_train, y_train)

[0.66666667]


In [None]:
#10.
from sklearn.svm import SVC

X_train = [[1, 100], [2, 200], [3, 300]]
y_train = [0, 1, 0]

model = SVC()
model.fit(X_train, y_train)
print(model.predict([[1, 150]]))  # Unreliable output due to large-scale difference

In [14]:
#Correct Code

#SVM is sensitive to feature scales. StandardScaler normalizes the features, improving model performance.
    
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

X_train = [[1, 100], [2, 200], [3, 300]]
y_train = [0, 1, 0]

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

model.fit(X_train_scaled, y_train)

X_test_scaled = scaler.transform([[1, 150]])
print(model.predict(X_test_scaled))  # More reliable output

[0]
