In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score,mean_squared_error, r2_score

# Read the CSV file, using semicolon as a delimiter and skipping the last empty column
data = pd.read_csv("Cities1.csv", delimiter=';', usecols=lambda x: 'Unnamed' not in x)

# Clean up column names by removing leading/trailing spaces
data.columns = data.columns.str.strip()

print("Column names after cleaning:")
print(data.columns)

Column names after cleaning:
Index(['City,Region,Country,AirQuality,WaterPollution'], dtype='object')


In [7]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Read the CSV file, using semicolon as a delimiter and skipping the last empty column
data = pd.read_csv("Cities1.csv", delimiter=';', usecols=lambda x: 'Unnamed' not in x)

# The data is in a single column, so we need to split it by comma
data = data['City,Region,Country,AirQuality,WaterPollution'].str.split(',', expand=True)

# Rename the columns - accounting for the extra column
data.columns = ['City', 'Region', 'Country', 'AirQuality', 'WaterPollution', 'ExtraColumn']

# Clean up column names by removing leading/trailing spaces
data.columns = data.columns.str.strip()

# Convert 'AirQuality' and 'WaterPollution' to numeric, coercing errors to NaN
data['AirQuality'] = pd.to_numeric(data['AirQuality'], errors='coerce')
data['WaterPollution'] = pd.to_numeric(data['WaterPollution'], errors='coerce')

# Drop the 'ExtraColumn' as it's not needed
data = data.drop(columns=['ExtraColumn'])


# Display the first few rows and information about the data
print(data.head())
print(data.info())

# Define features (x) and target variable (y) based on the Cities1.csv dataset
# We'll use 'AirQuality', 'City', 'Region', and 'Country' as features to predict 'WaterPollution'
features = ['AirQuality', 'City', 'Region', 'Country']
target = 'WaterPollution'

# Handle missing values by dropping rows with NaNs and create a copy
data_cleaned = data.dropna(subset=features + [target]).copy()

# Apply Label Encoding to categorical features
label_encoders = {}
for col in ['City', 'Region', 'Country']:
    label_encoders[col] = LabelEncoder()
    data_cleaned[col] = label_encoders[col].fit_transform(data_cleaned[col])


x_cleaned = data_cleaned[features]
y_cleaned = data_cleaned[target]


# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x_cleaned, y_cleaned, test_size=0.2, random_state=2)

# Initialize and train the Linear Regression model
# Since the target variable 'WaterPollution' is continuous, Linear Regression is appropriate.
lrr = LinearRegression()
lrr.fit(x_train, y_train)
y_lrr_pred = lrr.predict(x_test)

mse = mean_squared_error(y_test, y_lrr_pred)
r2 = r2_score(y_test, y_lrr_pred)

print("Linear Regression MSE:", mse)
print("Linear Regression R2:", r2)

# The Decision Tree Classifier is not suitable for this regression task,
# so I will remove the Decision Tree code.

            City      Region                   Country  AirQuality  \
0  New York City    New York  United States of America   46.816038   
1     Washington        D.C.      District of Columbia         NaN   
2  San Francisco  California  United States of America   60.514019   
3         Berlin                               Germany   62.364130   
4    Los Angeles  California  United States of America   36.621622   

   WaterPollution  
0       49.504950  
1       66.129032  
2       43.000000  
3       28.612717  
4       61.299435  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3963 entries, 0 to 3962
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   City            3963 non-null   object 
 1   Region          3963 non-null   object 
 2   Country         3963 non-null   object 
 3   AirQuality      3962 non-null   float64
 4   WaterPollution  3963 non-null   float64
dtypes: float64(2), object(3)
memory u