### 6

 21: Pandas DataFrame - CSV Import with apply() and map() Transformations

In [None]:
import pandas as pd
import numpy as np

# Load the raw table from the CSV file next to the notebook.
df = pd.read_csv("data.csv")


# Demonstration of Series.apply() and Series.map() using small named helpers
# instead of inline lambdas.
def _with_five_percent_raise(amount):
    """Return ``amount`` multiplied by 1.05, i.e. increased by 5%."""
    return amount * 1.05


def _seniority_label(years):
    """Return 'senior' when ``years`` is greater than 10, otherwise 'junior'.

    Any value for which ``years > 10`` evaluates false (NaN included) is
    labelled 'junior'.
    """
    if years > 10:
        return 'senior'
    return 'junior'


# apply(): element-wise 5% raise on the salary column.
df['salary_adjusted'] = df['salary'].apply(_with_five_percent_raise)

# apply() builds the category column; map() then upper-cases each label.
df['experience_level'] = df['experience'].apply(_seniority_label)
df['level_upper'] = df['experience_level'].map(str.upper)

print(df.head())


22: Data Preprocessing - Missing Data, Outliers, and Standardization

In [None]:
from sklearn.preprocessing import StandardScaler

# --- Missing data -----------------------------------------------------------
# Replace missing values in numeric columns with that column's mean.
# df.mean(numeric_only=True) only contains numeric columns, so non-numeric
# columns are left untouched. Assignment is used instead of inplace=True so the
# step reads as "new frame from old frame".
df = df.fillna(df.mean(numeric_only=True))

# --- Outlier detection (salary, 1.5 * IQR rule) ------------------------------
# The fences are computed on salary as it is at this point, i.e. before the
# standardization below overwrites the column.
Q1 = df['salary'].quantile(0.25)
Q3 = df['salary'].quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

# Actually use the fences: select the rows falling outside them and report
# how many there are. Rows are only reported, not dropped, so later cells see
# the same set of rows as before.
salary_outliers = df[(df['salary'] < lower) | (df['salary'] > upper)]
print(f"Salary outliers outside [{lower:.2f}, {upper:.2f}]: {len(salary_outliers)}")

# --- Standardization ----------------------------------------------------------
# Rescale age and salary to zero mean / unit variance. This overwrites both
# columns in place; columns derived from salary earlier (e.g. salary_adjusted)
# keep their original units.
scaler = StandardScaler()
df[['age', 'salary']] = scaler.fit_transform(df[['age', 'salary']])

print(df.head())


 23: Data Visualization - Line Plot and Scatter Plot with Regression Line

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Figure 1: experience vs salary as a scatter plot with a fitted regression
# line. The title goes on the current axes, which is what plt.title targets.
sns.lmplot(data=df, x='experience', y='salary')
plt.gca().set_title("Experience vs Salary Trend")
plt.show()

# Figure 2: salary drawn as a line over experience. Rows are ordered by
# experience first so the line runs left to right.
experience_sorted = df.sort_values('experience')
trend_ax = experience_sorted[['experience', 'salary']].plot.line(x='experience', y='salary')
trend_ax.set_title("Salary Growth over Experience")
plt.show()


24: K-Nearest Neighbors (KNN) Classifier - Compare Different k-values

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Imported here as well so this cell does not rely on the preprocessing cell
# having brought StandardScaler into scope.
from sklearn.preprocessing import StandardScaler

# Prepare data for classification:
# use 'age' and 'experience' to predict whether salary is above the median.
X = df[['age', 'experience']]
y = (df['salary'] > df['salary'].median()).astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# KNN is distance-based, so both features must be on comparable scales;
# otherwise the feature with the larger numeric range dominates the distance.
# The scaler is fitted on the training split only and then applied to both
# splits, so no test-set statistics leak into training.
feature_scaler = StandardScaler().fit(X_train)
X_train_scaled = feature_scaler.transform(X_train)
X_test_scaled = feature_scaler.transform(X_test)

# n_neighbors may not exceed the number of fitted samples, so cap the sweep
# at the training-set size (matters for small CSV files).
max_k = min(10, len(X_train))
k_values = range(1, max_k + 1)

# Test-set accuracy for each k, in the same order as k_values.
accuracies = []
for k in k_values:
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    accuracies.append(accuracy_score(y_test, y_pred))

plt.plot(k_values, accuracies, marker='o')
plt.xticks(list(k_values))  # k is an integer; avoid fractional tick labels
plt.title("KNN Accuracy vs K Values")
plt.xlabel("K")
plt.ylabel("Accuracy")
plt.show()
