In [None]:
import pandas as pd

# Read final_review_data.csv into a DataFrame and show its first five rows
df_tripadvisor = pd.read_csv(filepath_or_buffer='final_review_data.csv')
df_tripadvisor.head(n=5)

In [None]:
# Read the 'Data' sheet directly from its path so pandas opens and closes the
# workbook itself; a handle created with a bare open(...) is never closed.
df_raw_tourism = pd.read_excel('data/tourism-data.xlsx', sheet_name='Data')

# Normalise 'Month' to 'YYYY-MM' strings so that every row falling in the same
# calendar month shares one grouping key.
df_raw_tourism['Month'] = pd.to_datetime(df_raw_tourism['Month']).dt.strftime('%Y-%m')

# Sum 'Total tourist arrivals' per month; reset_index() turns 'Month' back
# into a regular column.
df_tourism = (
    df_raw_tourism
    .groupby('Month')
    .agg({'Total tourist arrivals': 'sum'})
    .reset_index()
)
df_tourism.head()

In [None]:
# Show the first five rows of df_tripadvisor
df_tripadvisor.head(n=5)

In [None]:
# Average sentimentPolarity per travelDate, storing the mean in a column
# named average_sentimentPolarity (named aggregation does the rename in the
# same step).
#
# This cell overwrites df_tripadvisor with the aggregated frame, which no
# longer has a 'sentimentPolarity' column. The guard skips the aggregation when
# the output column already exists, so re-running the cell is a no-op instead
# of a KeyError. A frame that has neither column still raises on first run.
if 'average_sentimentPolarity' not in df_tripadvisor.columns:
    df_tripadvisor = (
        df_tripadvisor
        .groupby('travelDate')
        .agg(average_sentimentPolarity=('sentimentPolarity', 'mean'))
        .reset_index()
    )

In [None]:
# Sum 'Total tourist arrivals' per 'Month' and keep 'Month' as a column.
# df_tourism is already summed per Month where it is built, so this repeats
# that aggregation under the name df_tourism_final.
df_tourism_final = (
    df_tourism
    .groupby('Month')[['Total tourist arrivals']]
    .sum()
    .reset_index()
)

In [None]:

# Join the two frames where travelDate equals Month. This is an inner join,
# so rows without a match on the other side are dropped.
df_merged = df_tripadvisor.merge(
    df_tourism_final,
    how="inner",
    left_on="travelDate",
    right_on="Month",
)

df_merged.head()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Scatter plot with a fitted regression line: average sentiment polarity on
# the x-axis, total tourist arrivals on the y-axis.
# regplot returns the Axes it drew on, so labels and grid are set on that object.
ax = sns.regplot(
    data=df_merged,
    x='average_sentimentPolarity',
    y='Total tourist arrivals',
    scatter_kws={'s': 100},
)
ax.set_title('Correlation between Sentiment Polarity and Tourist Arrivals')
ax.set_xlabel('Sentiment Polarity')
ax.set_ylabel('Tourist Arrivals')
ax.grid(True)
plt.show()


In [None]:

# Pearson correlation between total tourist arrivals and average sentiment polarity
correlation = df_merged['Total tourist arrivals'].corr(
    other=df_merged['average_sentimentPolarity'],
    method='pearson',
)
print('Correlation between Tourist Arrivals and Sentiment Polarity:', correlation)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# Feature matrix (a one-column DataFrame) and target Series from the merged data
X = df_merged[['average_sentimentPolarity']]
y = df_merged['Total tourist arrivals']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so build and train the model in one step
model = LinearRegression().fit(X_train, y_train)

# Predict arrivals for the held-out rows
y_pred = model.predict(X_test)

# Mean squared error of those predictions against the held-out targets
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print('Mean Squared Error:', mse)