In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [6]:
import requests
from bs4 import BeautifulSoup

# Define the URL for lunar phase data
url = "https://astropixels.com/ephemeris/phasescat/phasescat.html"

# Fetch the page content
response = requests.get(url)
if response.status_code == 200:
    soup = BeautifulSoup(response.content, "html.parser")
    
    # Locate the table containing lunar phase data
    table = soup.find("table")
    moon_phases = []
    
    if table:
        for row in table.find_all("tr")[1:]:  # Skip header row
            cols = row.find_all("td")
            if len(cols) >= 2:
                date = cols[0].text.strip()
                phase = cols[1].text.strip()
                moon_phases.append({"date": date, "phase": phase})
    
    # Convert to DataFrame
    moon_df = pd.DataFrame(moon_phases)
    
    # Convert date to datetime format
    moon_df["date"] = pd.to_datetime(moon_df["date"], errors="coerce")

    
    moon_df

In [26]:
div_container = soup.find("div", id="container825")

print(div_container)

<div id="container825">
<div id="header">
<h1><img align="center" alt="AstroPixels Banner" height="50" src="../../images/AstroPixels-2w.gif" width="801"/></h1>
<p>
| <a href="../../index.html" title="Home">Home</a> |
<a href="http://www.astropixels.com/blog/" title="Blog">Blog</a> | 

<a href="../../main/recent.html" title="Recent Images">Recent Images</a> | 
<a href="../../main/photoindex.html" title="Photo Index">Photo Index</a> | 
<a href="../../bifrost/bifrostindex.html" title="Bifrost Index">Bifrost</a> | 
<a href="../../ephemeris/ephemeris.html" title="Ephemeris">Ephemeris</a> | 
<a href="../../main/resources.html" title="Resources">Resources</a> | 
<a href="../../main/astronews.html" title="News">News</a> | 
<a href="../../main/contact.html" title="Contact">Contact</a> | 
</p>
<p>
<a href="../../moon/phases2/phasesmosaics.html">
<img alt="Moon Phase Mosaic" class="center" src="../../moon/phases2/images/Phases10-7x-2w.jpg" style="padding:5px 0px;" width="80%"/>
<br/><strong> Comp

In [25]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# URL of the lunar phase page
url = "https://astropixels.com/ephemeris/phasescat/phases2001.html"

# Headers to mimic a real browser
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://astropixels.com/"
}

# Fetch the page content
response = requests.get(url, headers=headers)

if response.status_code == 200:
    soup = BeautifulSoup(response.content, "html.parser")

    # Locate the div containing lunar phase data
    div_container = soup.find("div", class_="pbox1a")

    if div_container:
        # Extract text content
        full_text = div_container.get_text("\n", strip=True)

        # Debugging: Print first 20 lines
        print("Extracted Text Preview:\n", "\n".join(full_text.split("\n")[:20]))

        # Split text into lines
        lines = full_text.split("\n")

        # Initialize DataFrame storage
        data = []
        year = None

        # Define a regex pattern to extract lunar phase data
        phase_pattern = re.compile(r"(\w{3,4})\s+(\d{1,2})\s+(\d{2}:\d{2})")

        for line in lines:
            line = line.strip()

            # Detect a new year
            if re.match(r"^\d{4}$", line):
                year = line  # Store the current year
                continue  # Move to the next line
            
            # Extract lunar phase data from the row
            matches = phase_pattern.findall(line)

            if matches:
                # Fill in missing phases if the row starts with blanks
                while len(matches) < 4:
                    matches.insert(0, ("", "", ""))

                # Convert to formatted row
                row = [year] + [" ".join(m).strip() if m != ("", "", "") else "" for m in matches]
                
                # Ensure only valid rows are added
                if len(row) == 5:
                    data.append(row)

        # Create DataFrame
        columns = ["Year", "New Moon", "First Quarter", "Full Moon", "Last Quarter"]
        moon_df = pd.DataFrame(data, columns=columns)

        # Check if DataFrame is empty before saving
        if not moon_df.empty:
            csv_filename = "lunar_phases.csv"
            moon_df.to_csv(csv_filename, index=False)
            print(f"Lunar phases data successfully saved to {csv_filename}")
        else:
            print("⚠️ Warning: Extracted DataFrame is empty. Check site structure manually.")

    else:
        print("⚠️ Failed to find <div class='pbox1a'>. The page structure might have changed.")

else:
    print(f"⚠️ Failed to retrieve lunar phase data. HTTP Status Code: {response.status_code}")


Extracted Text Preview:
 Phases of the Moon: 2001 to 2010
Universal Time (UT)
Year      New Moon       First Quarter       Full Moon       Last Quarter
2001                     Jan  2  22:31     Jan  9  20:24 t   Jan 16  12:35    
        Jan 24  13:07     Feb  1  14:02     Feb  8  07:12     Feb 15  03:24    
        Feb 23  08:21     Mar  3  02:03     Mar  9  17:23     Mar 16  20:45    
        Mar 25  01:21     Apr  1  10:49     Apr  8  03:22     Apr 15  15:31    
        Apr 23  15:26     Apr 30  17:08     May  7  13:53     May 15  10:11    
        May 23  02:46     May 29  22:09     Jun  6  01:39     Jun 14  03:28    
        Jun 21  11:58 T   Jun 28  03:20     Jul  5  15:04 p   Jul 13  18:45    
        Jul 20  19:44     Jul 27  10:08     Aug  4  05:56     Aug 12  07:53    
        Aug 19  02:55     Aug 25  19:55     Sep  2  21:43     Sep 10  19:00    
        Sep 17  10:27     Sep 24  09:31     Oct  2  13:49     Oct 10  04:20    
        Oct 16  19:23     Oct 24  02:58     Nov  

In [38]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re


url = "https://astropixels.com/ephemeris/phasescat/phases2001.html"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://astropixels.com/"
}


response = requests.get(url, headers=headers)

if response.status_code == 200:
    soup = BeautifulSoup(response.content, "html.parser")

    
    lunar_phase_divs = soup.find_all("div", class_="pbox1a")

    if lunar_phase_divs:
        all_data = []
        year = None

        for div in lunar_phase_divs:
            pre_tag = div.find("pre")
            if pre_tag:
                full_text = pre_tag.get_text("\n", strip=True)
                lines = full_text.split("\n")

                for line in lines:
                    line = line.strip()

                    
                    if re.match(r"^\d{4}$", line):
                        year = line
                        continue  

                   
                    parts = re.split(r"\s{2,}", line)

                    
                    if len(parts) == 4:
                        parts.insert(0, year)

                    
                    if len(parts) == 5:
                        all_data.append(parts)

        
        columns = ["Year", "New Moon", "First Quarter", "Full Moon", "Last Quarter"]
        moon_df = pd.DataFrame(all_data, columns=columns)

        
        print("\nExtracted Data Preview:")
        print(moon_df.head())

        
        if not moon_df.empty:
            csv_filename = "lunar_phases_2001_2100.csv"
            moon_df.to_csv(csv_filename, index=False)
            print(f"\nLunar phases data successfully saved to {csv_filename}")
        else:
            print("\nExtracted DataFrame is empty. Check extracted text manually.")

    else:
        print("\nNo lunar phase data found. Check site structure manually.")

else:
    print(f"\nFailed to retrieve lunar phase data. HTTP Status Code: {response.status_code}")



Extracted Data Preview:
   Year  New Moon  First Quarter  Full Moon  Last Quarter
0  Year  New Moon  First Quarter  Full Moon  Last Quarter
1  Year  New Moon  First Quarter  Full Moon  Last Quarter
2  None      2002            Jan          6         03:55
3  Year  New Moon  First Quarter  Full Moon  Last Quarter
4  None    Dec 23          09:43     Dec 30         10:03

Lunar phases data successfully saved to lunar_phases_2001_2100.csv


In [39]:
df = pd.read_csv(r"C:\Users\Batia\Desktop\DataScienceNotebooks\Studentai\Vladimir\autoplius_listings.csv")

In [40]:
df

Unnamed: 0,Header,Link
0,BMW 428 Gran Coupe,https://autoplius.lt/skelbimai/bmw-428-gran-co...
1,BMW 530,https://autoplius.lt/skelbimai/bmw-530-2-0-l-s...
2,Volkswagen Golf,https://autoplius.lt/skelbimai/volkswagen-golf...
3,Fiat 500X,https://autoplius.lt/skelbimai/fiat-500x-2-4-l...
4,BMW 320 Gran Turismo,https://autoplius.lt/skelbimai/bmw-320-gran-tu...
...,...,...
6935,Citroen DS3,https://autoplius.lt/skelbimai/citroen-ds3-1-6...
6936,Citroen C5,https://autoplius.lt/skelbimai/citroen-c5-2-0-...
6937,Volkswagen Touran,https://autoplius.lt/skelbimai/volkswagen-tour...
6938,Toyota Auris,https://autoplius.lt/skelbimai/toyota-auris-1-...


In [None]:
df = #sql užklausos vykdymas per mysql/mariadb kursorių
sql="""paste SQL command here!"""
C.execute(sql)
ans = C.fetchall()
? = list(map(list, zip(*ans)))
#? = np.transpose(ans)

In [59]:
import sqlite3
import pandas as pd


conn = sqlite3.connect(r"C:\Users\Batia\Desktop\DataScienceNotebooks\Studentai\Vladimir\cars1.db")


df = pd.read_sql("SELECT brand, price, engine, fuel_type, mileage, registration_year FROM car_listings", con=conn)


conn.close()


print(df.head())


        brand     price                    engine         fuel_type  \
0         BMW  18 000 €  1997 cm³, 245 AG (180kW)          Benzinas   
1         BMW  26 900 €  1998 cm³, 252 AG (185kW)          Benzinas   
2  Volkswagen   7 100 €   1598 cm³, 105 AG (77kW)         Dyzelinas   
3        Fiat  10 500 €  2360 cm³, 177 AG (130kW)  Benzinas / dujos   
4         BMW  14 500 €  1995 cm³, 190 AG (140kW)         Dyzelinas   

      mileage registration_year  
0  199 000 km              2015  
1   46 000 km              2019  
2  314 086 km           2014-03  
3  126 211 km              2016  
4  162 000 km           2016-06  


In [None]:
# Gamintojas, Amžius, Kuro tipas, Rida, variklio galia kW, kaina Eurais

In [60]:
def clean_and_convert(column, unit):
    return pd.to_numeric(
        df[column].str.replace(unit, '', regex=True).str.replace(' ', ''), 
        errors='coerce'
    )


df['price'] = clean_and_convert('price', '€')
df['mileage'] = clean_and_convert('mileage', 'km')

In [61]:
from datetime import datetime

df = df[df['registration_year'].str[:4].str.isnumeric()]
df['registration_year'] = df['registration_year'].str[:4].astype(int)

now = datetime.now().year
df['age'] = now - df['registration_year']

In [62]:
df['kw'] = df['engine'].str.extract(r'\((\d+)kW\)')

df['kw'] = pd.to_numeric(df['kw'], errors='coerce')

In [47]:
df

Unnamed: 0,brand,price,engine,fuel_type,mileage,registration_year,age,kw
0,BMW,18000,"1997 cm³, 245 AG (180kW)",Benzinas,199000.0,2015,10,180.0
1,BMW,26900,"1998 cm³, 252 AG (185kW)",Benzinas,46000.0,2019,6,185.0
2,Volkswagen,7100,"1598 cm³, 105 AG (77kW)",Dyzelinas,314086.0,2014,11,77.0
3,Fiat,10500,"2360 cm³, 177 AG (130kW)",Benzinas / dujos,126211.0,2016,9,130.0
4,BMW,14500,"1995 cm³, 190 AG (140kW)",Dyzelinas,162000.0,2016,9,140.0
...,...,...,...,...,...,...,...,...
6188,Hyundai,2850,"1120 cm³, 75 AG (55kW)",Dyzelinas,192000.0,2013,12,55.0
6189,Citroen,2850,"1560 cm³, 116 AG (85kW)",Dyzelinas,89652.0,2015,10,85.0
6190,Volkswagen,2850,"1900 cm³, 105 AG (77kW)",Dyzelinas,320321.0,2007,18,77.0
6191,Toyota,2890,"1364 cm³, 88 AG (65kW)",Dyzelinas,290000.0,2007,18,65.0


In [64]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder

In [65]:
gamLE = LabelEncoder()
ktLE = LabelEncoder()
gamLE.fit(df['brand'])
df['brand'] = gamLE.transform(df['brand'])
ktLE.fit(df['fuel_type'])
df['fuel_type'] = ktLE.transform(df['fuel_type'])

In [63]:

df = df.drop(columns=['engine', 'registration_year'])

In [66]:
Y_target = df['price']
X_features = df.drop(columns=['price'])

X_train, X_test, y_train, y_test = train_test_split(X_features, Y_target, test_size=0.15, random_state=0)

In [67]:
scaler = StandardScaler()
scaler.fit(X_train[['age','mileage','kw']])
X_train_scaled = scaler.transform(X_train[['age','mileage','kw']]); X_train_scaled

array([[-1.32201565, -0.64141257, -0.27837915],
       [-0.493517  , -1.02440786, -0.6767854 ],
       [-0.16211755,  1.09018925, -0.59710415],
       ...,
       [-0.493517  ,  0.42409091,  0.51843337],
       [-0.32781727,  0.13712286,  0.99652087],
       [-1.48771537, -1.35039439,  0.04034586]])

In [68]:
PR_sc, R_sc, V_sc = zip(*X_train_scaled)
X_train['age'] = PR_sc
X_train['mileage'] = R_sc
X_train['kw'] = V_sc ; X_train

Unnamed: 0,brand,fuel_type,mileage,age,kw
2142,45,1,-0.641413,-1.322016,-0.278379
3838,8,5,-1.024408,-0.493517,-0.676785
4020,52,1,1.090189,-0.162118,-0.597104
4473,17,5,0.042954,1.163480,-0.119017
3235,52,1,0.733070,2.157679,-0.852084
...,...,...,...,...,...
4974,17,5,0.042954,1.163480,-0.119017
3307,6,1,-0.120817,0.832081,-0.756467
1654,1,5,0.424091,-0.493517,0.518433
2615,33,5,0.137123,-0.327817,0.996521


In [69]:
# Apmokinome X_train
# Dabar su standart Scaler rescaliname X_test:
X_test_scaled = scaler.transform(X_test[['age','mileage','kw']]); X_test_scaled

array([[ 0.16928191, -0.44835911, -0.3899329 ],
       [ 0.83208083, -0.3500964 , -0.97957416],
       [ 0.50068137, -0.53843326, -0.56523165],
       ...,
       [ 0.83208083,  0.69803915, -0.03933539],
       [-1.48771537, -1.38185483,  0.82122212],
       [ 1.16348028,  0.53426797,  0.51843337]])

In [70]:
PR_sc, R_sc, V_sc = zip(*X_test_scaled)
X_test['age'] = PR_sc
X_test['mileage'] = R_sc
X_test['kw'] = V_sc ; X_test

Unnamed: 0,brand,fuel_type,mileage,age,kw
124,52,5,-0.448359,0.169282,-0.389933
4944,37,2,-0.350096,0.832081,-0.979574
4015,44,1,-0.538433,0.500681,-0.565232
2234,53,5,0.526079,-0.659217,0.725605
6063,8,5,-1.024408,-0.493517,-0.676785
...,...,...,...,...,...
2942,2,1,-0.751336,-1.156316,3.944727
531,2,5,1.005798,0.500681,-0.358060
3933,22,5,0.698039,0.832081,-0.039335
2801,51,3,-1.381855,-1.487715,0.821222


In [72]:
df.dropna(inplace=True)

In [74]:
df.replace('Nėra', pd.NA, inplace=True)

In [76]:
df = df.fillna(df.mean())

In [78]:
X_train.dropna(inplace=True)

In [80]:
print(X_train.shape)
print(y_train.shape)

(5022, 5)
(5227,)


In [81]:
df = df.dropna()

In [82]:
linearML = LinearRegression() # lininis modelis
linearML.fit(X_train, y_train) # apmokinimas

ValueError: Found input variables with inconsistent numbers of samples: [5022, 5227]

In [None]:
y_predicted = linearML.predict(X_test)

In [None]:
fig, ax = plt.subplots()
ax.scatter(X_test.index, y_test, label='Test data')
ax.scatter(X_test.index, y_predicted, label='predict.')
ax.legend(loc='best')
plt.show()

In [None]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

In [None]:
r2_score(y_test, y_predicted)

In [None]:
mean_squared_error(y_test, y_predicted)

In [None]:
rmse = mean_squared_error(y_test, y_predicted)**(1/2)
rmse