# Experiment 1: Data Preprocessing – Handling Missing Values

In [None]:
from io import StringIO
import pandas as pd

# Small inline CSV used by the preprocessing experiments. The row
# "Germany,40,,Yes" has an empty Salary field and "Spain,,52000,No" has an
# empty Age field; read_csv parses those empty fields as NaN.
data_text = '''Country,Age,Salary,Purchased
France,44,72000,No
Spain,27,48000,Yes
Germany,30,54000,No
Spain,38,61000,No
Germany,40,,Yes
France,35,58000,Yes
Spain,,52000,No
France,48,79000,Yes
Germany,50,83000,No
France,37,67000,Yes
'''
data = pd.read_csv(StringIO(data_text))
print("Original Data:")
print(data)

# Mean imputation: replace each missing value with its column's mean.
# The filled column is assigned back to the frame rather than calling
# fillna(inplace=True) on data[col]: that chained in-place form is deprecated
# and, with pandas Copy-on-Write, only modifies a temporary object and leaves
# `data` unchanged.
for column in ('Age', 'Salary'):
    data[column] = data[column].fillna(data[column].mean())
print("After Filling Missing Values:")
print(data)

# Experiment 2: Label Encoding and One Hot Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

le = LabelEncoder()
data['Country'] = le.fit_transform(data['Country'])
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
data_encoded = ct.fit_transform(data)
print("Encoded Data:")
print(data_encoded)

# Experiment 3: Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Work on a DataFrame copy of the encoded array so data_encoded stays intact.
data_scaled = pd.DataFrame(data_encoded).copy()
sc = StandardScaler()

# Standardise only the numeric features (Age, Salary). The encoded layout is
# [one-hot Country columns..., Age, Salary, Purchased], so the final column
# holds the "Yes"/"No" labels; a trailing two-column slice would include those
# strings and StandardScaler cannot convert them to float. Age and Salary are
# the two columns immediately before the label.
numeric_cols = data_scaled.columns[-3:-1]
# The passthrough columns arrive as generic objects, so cast to float before
# scaling; assigning by column label stores the result as float columns.
data_scaled[numeric_cols] = sc.fit_transform(data_scaled[numeric_cols].astype(float))
print("Scaled Data:")
print(data_scaled)

# Experiment 4: Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

# Inline salary dataset: one feature (years of experience) and one target.
salary_data = '''YearsExperience,Salary
1.1,39343
1.3,46205
1.5,37731
2.0,43525
2.2,39891
2.9,56642
3.0,60150
3.2,54445
3.2,64445
3.7,57189
'''
df_salary = pd.read_csv(StringIO(salary_data))

# Keep X two-dimensional (a one-column DataFrame), as the estimator expects.
X = df_salary[['YearsExperience']]
y = df_salary['Salary']

# fit() returns the estimator itself, so construction and training chain.
model = LinearRegression().fit(X, y)
fitted_salary = model.predict(X)

# Observed points in red, fitted regression line in blue.
plt.scatter(X, y, color='red')
plt.plot(X, fitted_salary, color='blue')
plt.xlabel("Years of Experience")
plt.ylabel("Salary")
plt.title("Salary vs Experience")
plt.show()

# Experiment 5: Logistic Regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Inline sample of the social-network-ads data.
ads_data = '''User ID,Gender,Age,EstimatedSalary,Purchased
15624510,Male,19,19000,0
15810944,Male,35,20000,0
15668575,Female,26,43000,1
15603246,Female,27,57000,1
15804002,Male,19,76000,0
'''
df_ads = pd.read_csv(StringIO(ads_data))

# Features are Age and EstimatedSalary (columns 2 and 3); the target is
# Purchased (column 4). Both are taken as NumPy arrays.
X = df_ads[['Age', 'EstimatedSalary']].values
y = df_ads['Purchased'].values

# Hold out 25% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

# Fit the scaler on the training rows only, then apply it to both splits.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# fit() returns the estimator, so the model can be built and trained at once.
model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Experiment 6: Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.datasets import load_iris

# Load iris as pandas objects: a feature DataFrame and a target Series.
iris = load_iris(as_frame=True)
X, y = iris.data, iris.target

# Hold out 25% of the samples for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

# Train Gaussian Naive Bayes and report per-class metrics on the held-out set.
model = GaussianNB().fit(X_train, y_train)
y_pred = model.predict(X_test)
nb_report = classification_report(y_test, y_pred)
print("Naive Bayes Classification Report:\n", nb_report)

# Experiment 7: NLP Preprocessing

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag

# Fetch the NLTK resources used below. Recent NLTK releases look up the
# tokenizer and tagger models under the names 'punkt_tab' and
# 'averaged_perceptron_tagger_eng', while older releases use 'punkt' and
# 'averaged_perceptron_tagger', so both spellings are requested.
for resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)

para = "Rajgad is a hill fort in Maharashtra. It was the capital of the Maratha Empire."

# Tokenisation at word and sentence level.
tokens = word_tokenize(para)
sentences = sent_tokenize(para)

# Stop-word removal. The stop-word list is loaded once into a set instead of
# being re-read for every token; the comparison is case-insensitive.
english_stopwords = set(stopwords.words('english'))
filtered = [w for w in tokens if w.lower() not in english_stopwords]

# Stemming of the filtered tokens with the Porter stemmer.
ps = PorterStemmer()
stemmed = [ps.stem(w) for w in filtered]

# Lemmatisation of the filtered tokens, treating every token as a verb.
wnl = WordNetLemmatizer()
lemmatized = [wnl.lemmatize(w, pos='v') for w in filtered]

# Part-of-speech tags are computed on the unfiltered tokens.
tags = pos_tag(tokens)

print("Tokens:", tokens)
print("Sentences:", sentences)
print("Filtered:", filtered)
print("Stemmed:", stemmed)
print("Lemmatized:", lemmatized)
print("POS Tags:", tags)

# Experiment 8: Titanic Dataset - Visualization

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Load seaborn's titanic example dataset and plot fare against age
# with a joint plot.
df_titanic = sns.load_dataset('titanic')
sns.jointplot(data=df_titanic, x='age', y='fare')
plt.show()

# Experiment 9: Titanic Dataset - Boxplot

In [None]:
# Each boxplot gets its own figure. sns.boxplot draws on the current Axes, so
# calling it twice in a row without a new figure stacks the second plot on
# top of the first.

# Age distribution by sex.
plt.figure()
sns.boxplot(x='sex', y='age', data=df_titanic)

# Age distribution by sex, split by survival.
plt.figure()
sns.boxplot(x='sex', y='age', hue='survived', data=df_titanic)

plt.show()

# Experiment 10: Iris Dataset - Visualization

In [None]:
# Load seaborn's iris example dataset.
df_iris = sns.load_dataset('iris')

# Pairwise feature relationships, coloured by species. pairplot builds its
# own figure.
sns.pairplot(df_iris, hue='species')

# histplot and boxplot draw on the current Axes. Without a fresh figure they
# would land on a panel of the pair plot and on top of each other, so each
# one gets a new figure.
plt.figure()
sns.histplot(df_iris['sepal_length'], kde=True)

plt.figure()
sns.boxplot(x=df_iris['petal_length'])

plt.show()