In [35]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score
import mysql.connector
import joblib

In [44]:

# Step 1: Load Datasets
print("Step 1: Loading datasets...")
city_df = pd.read_csv(r"E:\\Gopikaa\\GUVI\\Tourism Experience\\Files\\City.xlsx", na_values='-', encoding='latin1')
continent_df = pd.read_csv(r"E:\\Gopikaa\\GUVI\\Tourism Experience\\Files\\Continent.xlsx", encoding='latin1')
country_df = pd.read_csv(r"E:\\Gopikaa\\GUVI\\Tourism Experience\\Files\\Country.xlsx", encoding='latin1')
item_df = pd.read_csv(r"E:\\Gopikaa\\GUVI\\Tourism Experience\\Files\\Item.xlsx", encoding='latin1')
mode_df = pd.read_csv(r"E:\\Gopikaa\\GUVI\\Tourism Experience\\Files\\Mode.xlsx", encoding='latin1')
region_df = pd.read_csv(r"E:\\Gopikaa\\GUVI\\Tourism Experience\\Files\\Region.xlsx", encoding='latin1')
transaction_df = pd.read_csv(r"E:\\Gopikaa\\GUVI\\Tourism Experience\\Files\\Transaction.xlsx", encoding='latin1')
type_df = pd.read_csv(r"E:\\Gopikaa\\GUVI\\Tourism Experience\\Files\\Type.xlsx", encoding='latin1')
user_df = pd.read_csv(r"E:\\Gopikaa\\GUVI\\Tourism Experience\\Files\\User.xlsx", encoding='latin1')
print("Datasets loaded successfully.")



Step 1: Loading datasets...


ParserError: Error tokenizing data. C error: Expected 2 fields in line 3, saw 3


In [28]:
# Step 2: Merge Datasets
print("Step 2: Merging datasets...")
transaction_df['UserId'] = transaction_df['UserId'].astype(str)
user_df['UserId'] = user_df['UserId'].astype(str)
transaction_df['AttractionId'] = transaction_df['AttractionId'].astype(str)
type_df['AttractionId'] = type_df['AttractionId'].astype(str)
user_df['CityId'] = user_df['CityId'].astype(str)
city_df['CityId'] = city_df['CityId'].astype(str)
city_df['CountryId'] = city_df['CountryId'].astype(str)
country_df['CountryId'] = country_df['CountryId'].astype(str)
country_df['RegionId'] = country_df['RegionId'].astype(str)
region_df['RegionId'] = region_df['RegionId'].astype(str)
user_df['ContenentId'] = user_df['ContenentId'].astype(str)
continent_df['ContenentId'] = continent_df['ContenentId'].astype(str)

merged_df = pd.merge(transaction_df, user_df, on="UserId", how="inner")
merged_df = pd.merge(merged_df, type_df, on="AttractionId", how="inner")
merged_df = pd.merge(merged_df, city_df, on="CityId", how="inner")
merged_df = pd.merge(merged_df, country_df, on="CountryId", how="inner")
merged_df = pd.merge(merged_df, region_df, on="RegionId", how="inner")
merged_df = pd.merge(merged_df, continent_df, on="ContenentId", how="inner")
print("Datasets merged successfully.")
print(merged_df.head())


In [None]:
# Step 3: Clean Data
print("Step 3: Cleaning data...")
cleaned_df = merged_df.dropna()
print("Data cleaned successfully.")
print(cleaned_df.info())

In [None]:
# Step 4: Perform EDA
print("Step 4: Performing EDA...")
print("Dataset Information:")
print(cleaned_df.info())
print("\nDataset Description:")
print(cleaned_df.describe())
print("\nMissing Values:")
print(cleaned_df.isnull().sum())


In [None]:
# Step 5: Visualize Data
print("Step 5: Visualizing data...")
sns.countplot(x="VisitMode", data=cleaned_df)
plt.title("Visit Mode Distribution")
plt.show()

In [None]:
# Step 6: Save to SQL
print("Step 6: Saving to MySQL...")
conn = mysql.connector.connect(
    host="localhost",
    user="root",
    password="password",
    database="tourism_db"
)
cursor = conn.cursor()

cursor.execute("DROP TABLE IF EXISTS tourism_data")
create_table_query = '''
CREATE TABLE tourism_data (
    {});
'''.format(", ".join([f"{col} VARCHAR(255)" for col in cleaned_df.columns]))
cursor.execute(create_table_query)

for _, row in cleaned_df.iterrows():
    insert_query = "INSERT INTO tourism_data ({}) VALUES ({});".format(
        ", ".join(cleaned_df.columns), ", ".join(["%s"] * len(cleaned_df.columns)))
    cursor.execute(insert_query, tuple(row))

conn.commit()
conn.close()
print("Data saved to MySQL successfully.")



In [None]:
# Step 7: Train Machine Learning Models
print("Step 7: Training ML models...")
X = cleaned_df[["VisitYear", "VisitMonth", "AttractionTypeId"]]
y_reg = cleaned_df["Rating"]
y_clf = cleaned_df["VisitMode"]

X_train, X_test, y_train_reg, y_test_reg = train_test_split(X, y_reg, test_size=0.2, random_state=42)
_, _, y_train_clf, y_test_clf = train_test_split(X, y_clf, test_size=0.2, random_state=42)

regressor = LinearRegression()
regressor.fit(X_train, y_train_reg)
y_pred_reg = regressor.predict(X_test)
print("Regression MSE:", mean_squared_error(y_test_reg, y_pred_reg))

classifier = RandomForestClassifier()
classifier.fit(X_train, y_train_clf)
y_pred_clf = classifier.predict(X_test)
print("Classification Accuracy:", accuracy_score(y_test_clf, y_pred_clf))

joblib.dump(regressor, "regressor_model.pkl")
joblib.dump(classifier, "classifier_model.pkl")
print("Models trained and saved successfully.")