In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from sklearn.model_selection import train_test_split, GridSearchCV
import seaborn as sns
from sklearn.linear_model import LinearRegression

In [None]:
# Load the housing dataset from the CSV file next to the notebook.
housing = pd.read_csv(filepath_or_buffer="housing.csv")

In [None]:
# Keep an independent snapshot of the data exactly as it was loaded.
# `housing` is already a DataFrame; passing it through pd.DataFrame(...) only
# re-wraps it without copying, so an explicit copy is taken instead.
raw_df = housing.copy()

In [None]:
# Preview the first five rows of the raw data.
raw_df.head(n=5)

In [None]:
# Column names, dtypes and non-null counts of the raw data (reveals missing values).
raw_df.info()

In [None]:
# Discard every row that contains at least one missing value, then print the
# summary again to confirm the non-null counts now match across columns.
df = raw_df.dropna(axis=0, how="any")
df.info()

In [None]:
# Keep only the rows whose median_house_value is at most 500000.
value_within_limit = df["median_house_value"].le(500000)
less_than_500001 = df.loc[value_within_limit]
# Sanity check: no row above the limit may remain, so this displays an empty frame.
less_than_500001.loc[less_than_500001["median_house_value"].gt(500000)]

In [None]:
# From here on `df` refers to the value-filtered frame; display it as the cell output.
df = less_than_500001
df

In [None]:
# Overlay the filtered records on a map image, coloured by median house value.
fig, ax = plt.subplots(figsize=(10, 10))

# Background image
background_img = mpimg.imread('./pictures/california-map.jpg')
extent = [-124.48, -114.13, 32.53, 42.01]  # left, right, bottom, top of the image
scale_factor = 1.08  # Adjust the scale factor as needed for the desired enlargement

# Half of the extra width/height goes to each side so the box stays centred.
x_pad = (extent[1] - extent[0]) * (scale_factor - 1) / 2
y_pad = (extent[3] - extent[2]) * (scale_factor - 1) / 2
new_extent = [extent[0] - x_pad, extent[1] + x_pad, extent[2] - y_pad, extent[3] + y_pad]
ax.imshow(background_img, extent=new_extent, aspect='auto')

# Scatter plot
scatter = ax.scatter(
    df['longitude'],
    df['latitude'],
    c=df['median_house_value'],
    cmap='viridis',
    alpha=0.6,
)
fig.colorbar(scatter, ax=ax, label='Median House Value')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.set_title('Geographical Distribution of Houses')
plt.show()

In [None]:
# The histograms help in understanding the distribution and spread of the
# numeric variables in the dataset.

# One histogram per numeric column of df, 54 bins each, on a 20x15 figure.
df.hist(figsize=(20, 15), bins=54)

# Give every subplot of the figure just created the same y-axis label.
hist_figure = plt.gcf()
for hist_ax in hist_figure.get_axes():
    hist_ax.set_ylabel('Number of Houses')

plt.show()

In [None]:
# Median income vs. median house value, coloured by ocean proximity so we can
# see whether closeness to the ocean goes along with different house values.
plt.figure(figsize=(12, 8))

# Convert the categorical column to integer codes for colouring, and keep the
# categorical itself so the colorbar can show the category names instead of
# bare codes.
proximity = df['ocean_proximity'].astype('category')
proximity_labels = list(proximity.cat.categories)

scatter = plt.scatter(df['median_income'], df['median_house_value'],
                      c=proximity.cat.codes,
                      cmap='viridis', alpha=0.6)  # alpha 0.6 keeps overlapping points readable

# One tick per category code, labelled with the category it stands for.
colorbar = plt.colorbar(scatter, label='Ocean Proximity',
                        ticks=range(len(proximity_labels)))
colorbar.set_ticklabels(proximity_labels)

plt.xlabel('Median Income (scaled * 10,000)')
plt.ylabel('Median House Value ($)')
plt.title('Median Income vs Median House Value')
plt.grid(True)

plt.show()

In [None]:
# One-hot encode ocean_proximity: add one indicator column per category and
# drop the original text column.
# Built from less_than_500001 (the frame `df` was assigned from) rather than
# from `df` itself, so re-running this cell gives the same result instead of
# failing because ocean_proximity has already been dropped.
df = (
    less_than_500001
    .join(pd.get_dummies(less_than_500001["ocean_proximity"]))
    .drop(columns=["ocean_proximity"])
)

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of df.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(15, 8))
correlations = df.corr()
sns.heatmap(correlations, annot=True, cmap="YlGnBu", ax=heatmap_ax)

In [None]:
# NOTE(review): disabled earlier draft of the train/test split, kept for
# reference only. It would split the raw `housing` frame (every column except
# the target) rather than the cleaned `df`; the split that is actually used is
# the active train_test_split call elsewhere in this notebook.
#X = housing.drop(columns=['median_house_value'], axis=1)
#y = housing['median_house_value']
#X_train, X_test, y_train,  y_test = train_test_split(X,y, test_size=0.2, random_state=56)

In [None]:
# Feature columns fed to the model, and the column it should predict.
feature_columns = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'population',
    'median_income',
]
X = df[feature_columns]
y = df['median_house_value']

In [None]:

X_train.head()

In [None]:

X_train.describe()

In [None]:

plt.figure(figsize=(5,5))
sns.pairplot(housing)
plt.show()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

print("\nTraining data size:", X_train.shape)
print("Testing data size:", X_test.shape)

In [None]:
# Train an ordinary linear regression on the training split
# (fit() returns the fitted estimator itself).
regression_model = LinearRegression().fit(X_train, y_train)

# Score the model on the data it was trained on and on the held-out test data.
train_score = regression_model.score(X_train, y_train)
test_score = regression_model.score(X_test, y_test)
print(f'\nTraining score: {train_score}', f'Testing score: {test_score}', sep='\n')

In [None]:
# Reproduce the model's predictions by hand from its fitted parameters
# (prediction = features @ coefficients + intercept) and compare a few of
# them with the true values.
coefficients = regression_model.coef_
intercept = regression_model.intercept_

# Predict for the whole testing set; only a short preview is printed below.
samples_to_predict = X_test

# Vectorised calculation: one matrix-vector product, no Python loop.
predictions = samples_to_predict.to_numpy() @ coefficients + intercept

# True target values, in the same row order as the predictions.
actual_data = y_test

# The score describes the model on the whole test set, not a single sample,
# so it is computed once here instead of once per printed row.
acc = regression_model.score(X_test, y_test)

# Print only the first few prediction/actual pairs rather than the whole test set.
n_preview = 3
for pred, actual in zip(predictions[:n_preview], actual_data.iloc[:n_preview]):
    print("Prediction:", pred)
    print("Actual data:", actual)
    print()

print(f"R^2 on test set: {acc:.4f}")