In [94]:
import pandas as pd
import numpy as np
import warnings
import tqdm
warnings.filterwarnings("ignore")

In [95]:
# NOTE(review): TIME_CUT is not referenced by any cell visible in this notebook.
TIME_CUT = "1973-01-01"
# Passed to make_mapping: only rows with time earlier than this date are used to rank categories.
SPLIT_DATE_TRAIN = "2020-01-01"
# Magnitude threshold passed to add_features when building the "label" column.
MAG_TH = 5
# Side length, in degrees, of the grid cells used to discretise longitude and latitude.
GEO_SPLIT = 3

In [96]:
# Load the feature table, then discard exact duplicate rows and any row with a missing value.
df = pd.read_csv("../data/with_features.csv").drop_duplicates().dropna()
df

In [None]:
# Snap each coordinate down to the lower edge of its GEO_SPLIT-degree grid cell.
df["longitude_disc"] = df["longitude"].floordiv(GEO_SPLIT).mul(GEO_SPLIT).astype(int)
df["latitude_disc"] = df["latitude"].floordiv(GEO_SPLIT).mul(GEO_SPLIT).astype(int)
# Cell identifier "<latitude_disc>_<longitude_disc>"; add_region_info splits it back on "_".
lat_label = df["latitude_disc"].astype(str)
df["pos"] = lat_label + "_" + df["longitude_disc"].astype(str)
del lat_label

In [None]:
# Display the frame, which now carries the longitude_disc, latitude_disc and pos columns assigned above.
df

Unnamed: 0,time,longitude,latitude,depth,mag,magType,time_disc,longitude_disc,latitude_disc,pos,lat_cent,lon_cent,plate_region,dist_region,dist,plate,label
0,1973-01-01 01:05:56.150,-117.588000,34.189833,6.000,1.70,5,1973-01-01,-120,33,33_-120,34.5,-117.5,1,16.691592,19.302507,1,0
1,1973-01-01 04:46:09.800,150.634000,-9.214000,41.000,5.30,3,1973-01-01,150,-12,-12_150,-9.5,150.5,51,21.124956,55.729840,61,0
2,1973-01-01 05:20:59.780,-122.117333,48.308667,13.680,2.20,2,1973-01-01,-123,48,48_-123,48.5,-122.5,3,296.114618,314.738391,3,0
3,1973-01-01 06:22:29.800,-173.958000,-15.012000,33.000,5.00,3,1973-01-01,-174,-18,-18_-174,-15.5,-173.5,34,84.317941,18.239739,42,0
4,1973-01-01 08:58:11.460,-155.360833,19.443667,7.302,1.85,1,1973-01-01,-156,18,18_-156,19.5,-155.5,2,3527.668174,3522.498687,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4100001,2023-10-01 01:27:41.780,-179.015000,51.730000,10.980,1.22,1,2023-10-01,-180,51,51_-180,51.5,-179.5,1,113.570487,137.833943,1,0
4100002,2023-10-01 01:30:30.046,152.699600,-4.342900,65.434,4.90,3,2023-10-01,150,-6,-6_150,-4.5,152.5,33,39.066841,17.721537,35,0
4100003,2023-10-01 01:33:05.281,-152.662000,60.099900,109.400,1.90,1,2023-10-01,-153,60,60_-153,60.5,-152.5,1,397.084905,387.824459,1,0
4100004,2023-10-01 01:41:39.645,-175.317600,51.902800,75.100,2.10,1,2023-10-01,-177,51,51_-177,51.5,-175.5,1,84.433512,123.080765,1,0


In [None]:
# Load the plate-boundary points (plate, lat, lon), drop exact duplicates and renumber the rows from 0.
df_tp = (
    pd.read_csv("../data/all.csv")
    .drop_duplicates()
    .reset_index(drop=True)
)
df_tp

Unnamed: 0,plate,lat,lon
0,am,30.754,132.824
1,am,30.970,132.965
2,am,31.216,133.197
3,am,31.515,133.500
4,am,31.882,134.042
...,...,...,...
11803,yz,20.561,112.784
11804,yz,20.137,113.030
11805,yz,19.713,113.274
11806,yz,19.288,113.517


In [None]:
def haversine_distance(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points using the haversine formula.

    All coordinates are given in degrees (they are converted with
    ``np.radians``). Each argument may be a scalar or an array-like such as a
    pandas Series; the usual NumPy broadcasting rules apply.

    Args:
        lat1: Latitude of the first point(s), in degrees.
        lon1: Longitude of the first point(s), in degrees.
        lat2: Latitude of the second point(s), in degrees.
        lon2: Longitude of the second point(s), in degrees.

    Returns:
        The distance scaled by the sphere radius ``R = 6371.0`` (Earth's mean
        radius in kilometres, so the result is in kilometres). NaN inputs
        propagate to NaN outputs.
    """
    R = 6371.0
    lat1, lon1, lat2, lon2 = (np.radians(v) for v in (lat1, lon1, lat2, lon2))

    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Rounding can push `a` marginally outside [0, 1] (e.g. for antipodal
    # points), which would make sqrt/arcsin return NaN; clamp it back.
    a = np.minimum(np.maximum(a, 0.0), 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c

In [None]:
def find_min_dist(df_tp, x, y):
    """Find the plate-boundary point(s) closest to a location.

    Args:
        df_tp: DataFrame of boundary points with "lat", "lon" and "plate"
            columns. It is only read; no column is added to it.
        x: Latitude of the query location, in degrees.
        y: Longitude of the query location, in degrees.

    Returns:
        A ``(min_dist, plates)`` tuple: the smallest distance returned by
        ``haversine_distance`` and the "plate" values of every row lying at
        exactly that distance, sorted and joined with "_".
    """
    # Keep the distances in a local Series instead of writing a scratch
    # "dist" column into the caller's frame on every call.
    dist = haversine_distance(x, y, df_tp["lat"], df_tp["lon"])
    min_dist = dist.min()
    plates = df_tp.loc[dist == min_dist, "plate"].sort_values().tolist()
    return min_dist, "_".join(plates)

In [None]:
def add_region_info(df, df_tp):
    """Add grid-cell centre coordinates and nearest-plate information to ``df``.

    For every distinct "pos" value the cell centre is rebuilt from the two
    numbers encoded in the string (each shifted by half of GEO_SPLIT) and
    passed to ``find_min_dist``.

    Args:
        df: Event frame with "latitude_disc", "longitude_disc" and "pos"
            columns. It is modified in place.
        df_tp: Plate-boundary points forwarded to ``find_min_dist``.

    Returns:
        The same ``df`` object with "lat_cent", "lon_cent", "plate_region" and
        "dist_region" columns set.
    """
    half_cell = GEO_SPLIT / 2
    df["lat_cent"] = df["latitude_disc"] + half_cell
    df["lon_cent"] = df["longitude_disc"] + half_cell

    # pos -> (distance, plates) for the centre of each distinct cell.
    nearest = {}
    for pos in tqdm.tqdm(df["pos"].unique()):
        lat_txt, lon_txt = pos.split("_")
        nearest[pos] = find_min_dist(
            df_tp, float(lat_txt) + half_cell, float(lon_txt) + half_cell
        )

    df["plate_region"] = df["pos"].map({key: hit[1] for key, hit in nearest.items()})
    df["dist_region"] = df["pos"].map({key: hit[0] for key, hit in nearest.items()})
    return df

In [None]:
def add_tectonic_info(df, df_tp):
    """Add the nearest plate-boundary distance and plate names for every row.

    Args:
        df: Event frame with "latitude" and "longitude" columns. It is
            modified in place.
        df_tp: Plate-boundary points forwarded to ``find_min_dist``.

    Returns:
        The same ``df`` object with "dist" and "plate" columns set from the
        ``find_min_dist`` result of each row.
    """
    pairs = list(zip(df["latitude"], df["longitude"]))
    results = [
        find_min_dist(df_tp, lat, lon)
        for lat, lon in tqdm.tqdm(pairs, total=len(pairs))
    ]
    df[["dist", "plate"]] = results
    return df

In [None]:
def add_features(df, df_tp, mag_th):
    """Add region features and a look-ahead "label" column to every event.

    For each grid cell ("pos") and each distinct "time_disc" value ``t`` in
    that cell, the rows at ``t`` are labelled 1 when the largest "mag" among
    the cell's rows with ``t < time_disc <= t + 1 month`` is at least
    ``mag_th``, and 0 otherwise. A window with no rows, or whose magnitudes
    are all missing, gives 0.

    Args:
        df: Event frame with "time", "time_disc", "mag", "pos",
            "latitude_disc" and "longitude_disc" columns. It is modified in
            place: "time" and "time_disc" are converted to datetimes and
            ``add_region_info`` adds its columns.
        df_tp: Plate-boundary points forwarded to ``add_region_info``.
        mag_th: Magnitude threshold for a positive label.

    Returns:
        A new DataFrame with the rows grouped by cell (in order of first
        appearance), sorted by "time" within each cell, and carrying the
        "label" column. An input without rows yields an empty frame that
        still has the "label" column.
    """
    df["time"] = pd.to_datetime(df["time"], format="mixed")
    df["time_disc"] = pd.to_datetime(df["time_disc"], format="mixed")
    df = add_region_info(df, df_tp)
    # add_tectonic_info(df, df_tp) is deliberately not called here.

    # One pass over the frame to split it by cell (sort=False keeps the order
    # of first appearance) instead of a full boolean scan per cell.
    regions = df.groupby("pos", sort=False)
    labelled_regions = []
    for _, region in tqdm.tqdm(regions, total=regions.ngroups):
        region = region.sort_values("time")
        bucket_of_row = region["time_disc"]
        pieces = []
        for bucket in bucket_of_row.unique():
            horizon = bucket + pd.DateOffset(months=1)
            in_window = (bucket_of_row > bucket) & (bucket_of_row <= horizon)
            upcoming = region.loc[in_window, "mag"]
            # `>=` is False for NaN, so an all-missing window is not a positive.
            label = int((not upcoming.empty) and upcoming.max() >= mag_th)
            # assign() returns a new frame, avoiding writes into a slice.
            pieces.append(region[bucket_of_row == bucket].assign(label=label))
        labelled_regions.append(pd.concat(pieces))

    if not labelled_regions:
        return df.assign(label=0).iloc[0:0]
    # Concatenate once: growing the result inside the loop re-copies every
    # previously accumulated row on each iteration.
    return pd.concat(labelled_regions)

In [None]:
# Build the labelled feature table; per the recorded progress output, the labelling loop took about 1h28.
df_final = add_features(df, df_tp, MAG_TH)

100%|██████████| 3099/3099 [00:14<00:00, 208.66it/s]
100%|██████████| 3099/3099 [1:27:46<00:00,  1.70s/it]  


In [None]:
# Characters with a special meaning in LaTeX text mode and their escaped forms.
# str.translate works in a single pass, so replacements are never re-escaped.
_TEX_SPECIALS = str.maketrans({
    "\\": "\\textbackslash{}",
    "&": "\\&",
    "%": "\\%",
    "$": "\\$",
    "#": "\\#",
    "_": "\\_",
    "{": "\\{",
    "}": "\\}",
    "~": "\\textasciitilde{}",
    "^": "\\textasciicircum{}",
})


def _tex_escape(value):
    """Return ``str(value)`` with LaTeX special characters escaped."""
    return str(value).translate(_TEX_SPECIALS)


def _tex_cell(value):
    """Render one table cell: real numbers with two decimals, the rest as escaped text."""
    if isinstance(value, (int, float, np.integer, np.floating)):
        return f"{value:.2f}"
    return _tex_escape(value)


def generateTEXtable(df, out, n=5):
    """Write the first ``n`` rows of ``df`` to ``out`` as a LaTeX ``tabularx`` table.

    The table has one ``X`` column per DataFrame column, a bold header row and
    an ``\\hline`` after every row. Column names and non-numeric cells are
    escaped so that characters such as ``_`` or ``&`` do not break the LaTeX
    compilation; numeric cells are written with two decimals.

    Args:
        df: DataFrame to export.
        out: Path of the file to (over)write.
        n: Maximum number of leading rows to include. Fewer rows are written
            when ``df`` is shorter; values below 1 write the header only.
    """
    columns = list(df.columns)
    header = " & ".join("\\textbf{" + _tex_escape(col) + "}" for col in columns)
    # Positional slicing works for any index and never runs past the end.
    rows = df.iloc[:max(n, 0)].itertuples(index=False, name=None)
    indent = " " * 8
    row_end = " \\\\ \\hline\n"

    with open(out, "w", encoding="utf-8") as file:
        file.write("\\begin{table}[h]\n")
        file.write("    \\centering\n")
        file.write(
            "    \\begin{tabularx}{\\textwidth}{|" + "X|" * len(columns) + "} \\hline\n"
        )
        file.write(indent + header + row_end)
        for row in rows:
            file.write(indent + " & ".join(_tex_cell(v) for v in row) + row_end)
        file.write("    \\end{tabularx}\n")
        file.write("    \\caption{caption}\n")
        file.write("    \\label{label}\n")
        file.write("\\end{table}\n")

In [None]:
# Reload a saved feature table, replacing any df_final computed earlier in the session.
# NOTE(review): no visible cell writes with_features_notmapped.csv — confirm where it is produced.
df_final = pd.read_csv("../data/with_features_notmapped.csv")

In [None]:
# Display the reloaded feature table.
df_final

Unnamed: 0,time,longitude,latitude,depth,mag,magType,time_disc,longitude_disc,latitude_disc,pos,lat_cent,lon_cent,plate_region,dist_region,dist,plate,label
0,1973-01-01 01:05:56.150,-117.588000,34.189833,6.000,1.70,mh,1973-01-01,-118,34,34_-118,34.5,-117.5,na_pa,16.691592,19.302507,na_pa,0
1,1973-01-01 04:46:09.800,150.634000,-9.214000,41.000,5.30,mb,1973-01-01,150,-10,-10_150,-9.5,150.5,WL_au,21.124956,55.729840,WL_au,0
2,1973-01-01 05:20:59.780,-122.117333,48.308667,13.680,2.20,md,1973-01-01,-123,48,48_-123,48.5,-122.5,jf_na,296.114618,314.738391,jf_na,0
3,1973-01-01 06:22:29.800,-173.958000,-15.012000,33.000,5.00,mb,1973-01-01,-174,-16,-16_-174,-15.5,-173.5,NI_TO,84.317941,18.239739,NI_TO,0
4,1973-01-01 08:58:11.460,-155.360833,19.443667,7.302,1.85,ml,1973-01-01,-156,19,19_-156,19.5,-155.5,jf_pa,3527.668174,3522.498687,jf_pa,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4100001,2023-10-01 01:27:41.780,-179.015000,51.730000,10.980,1.22,ml,2023-10-01,-180,51,51_-180,51.5,-179.5,na_pa,113.570487,137.833943,na_pa,0
4100002,2023-10-01 01:30:30.046,152.699600,-4.342900,65.434,4.90,mb,2023-10-01,152,-5,-5_152,-4.5,152.5,NB_SB,39.066841,17.721537,NB_SB,0
4100003,2023-10-01 01:33:05.281,-152.662000,60.099900,109.400,1.90,ml,2023-10-01,-153,60,60_-153,60.5,-152.5,na_pa,397.084905,387.824459,na_pa,0
4100004,2023-10-01 01:41:39.645,-175.317600,51.902800,75.100,2.10,ml,2023-10-01,-176,51,51_-176,51.5,-175.5,na_pa,84.433512,123.080765,na_pa,0


In [None]:
# Column order for the exported table, built from positions in df_final:
# the first six columns, then columns 7-9, then "dist" and "plate", then
# columns 10 up to (not including) the last three, then "time_disc" and "label".
# NOTE(review): this relies on the exact column layout of the CSV loaded above.
c = df_final.columns.tolist()
c = c[:6] + c[7:10] + ["dist", "plate"] + c[10:-3] + ["time_disc", "label"]

In [None]:
# Reordered view of df_final, using the column order stored in c.
tmp = df_final[c]
tmp

Unnamed: 0,time,longitude,latitude,depth,mag,magType,longitude_disc,latitude_disc,pos,dist,plate,lat_cent,lon_cent,plate_region,dist_region,time_disc,label
0,1973-01-01 01:05:56.150,-117.588000,34.189833,6.000,1.70,mh,-118,34,34_-118,19.302507,na_pa,34.5,-117.5,na_pa,16.691592,1973-01-01,0
1,1973-01-01 04:46:09.800,150.634000,-9.214000,41.000,5.30,mb,150,-10,-10_150,55.729840,WL_au,-9.5,150.5,WL_au,21.124956,1973-01-01,0
2,1973-01-01 05:20:59.780,-122.117333,48.308667,13.680,2.20,md,-123,48,48_-123,314.738391,jf_na,48.5,-122.5,jf_na,296.114618,1973-01-01,0
3,1973-01-01 06:22:29.800,-173.958000,-15.012000,33.000,5.00,mb,-174,-16,-16_-174,18.239739,NI_TO,-15.5,-173.5,NI_TO,84.317941,1973-01-01,0
4,1973-01-01 08:58:11.460,-155.360833,19.443667,7.302,1.85,ml,-156,19,19_-156,3522.498687,jf_pa,19.5,-155.5,jf_pa,3527.668174,1973-01-01,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4100001,2023-10-01 01:27:41.780,-179.015000,51.730000,10.980,1.22,ml,-180,51,51_-180,137.833943,na_pa,51.5,-179.5,na_pa,113.570487,2023-10-01,0
4100002,2023-10-01 01:30:30.046,152.699600,-4.342900,65.434,4.90,mb,152,-5,-5_152,17.721537,NB_SB,-4.5,152.5,NB_SB,39.066841,2023-10-01,0
4100003,2023-10-01 01:33:05.281,-152.662000,60.099900,109.400,1.90,ml,-153,60,60_-153,387.824459,na_pa,60.5,-152.5,na_pa,397.084905,2023-10-01,0
4100004,2023-10-01 01:41:39.645,-175.317600,51.902800,75.100,2.10,ml,-176,51,51_-176,123.080765,na_pa,51.5,-175.5,na_pa,84.433512,2023-10-01,0


In [None]:
# Export the first 5 rows of tmp as a LaTeX table into test.txt.
generateTEXtable(tmp, "test.txt", n=5)

In [None]:
def make_mapping(df, col, n, SPLIT_DATE_TRAIN):
    """Replace the values of ``df[col]`` with frequency-ranked integer ids.

    Frequencies are counted only on rows whose "time" is earlier than
    ``SPLIT_DATE_TRAIN``. The ``n`` most frequent values receive ids 1..n in
    rank order; every other value, including values that only occur on or
    after the split date, shares the id ``n + 1``.

    Args:
        df: Frame with a "time" column and the column ``col``. ``df[col]`` is
            overwritten in place.
        col: Name of the categorical column to encode.
        n: Number of categories that keep an individual id.
        SPLIT_DATE_TRAIN: Upper bound (exclusive) on "time" for the rows used
            to rank the categories.

    Returns:
        ``(df, type2id)``: the same frame object and the value -> id dict.
    """
    before_split = df["time"] < SPLIT_DATE_TRAIN
    ranked = df.loc[before_split, col].value_counts().index
    frequent, rare = ranked[:n], ranked[n:]

    type2id = dict(zip(frequent, range(1, len(frequent) + 1)))
    type2id.update(dict.fromkeys(rare, n + 1))
    # Values absent from the pre-split rows fall into the shared bucket too.
    type2id.update({value: n + 1 for value in df[col].unique() if value not in type2id})

    df[col] = df[col].map(type2id)
    return df, type2id

In [None]:
# Frequencies of the plate_region values ranked 61st and lower, counted over the whole of df_final.
df_final["plate_region"].value_counts()[60:]

plate_region
ON_yz       2347
lw_nu       2261
an_pa       2156
am_ps       2033
na_nu       2024
            ... 
an_sa_sc      13
au_mq          5
ca_co_na       4
ND_ca_sa       2
eu_su_yz       1
Name: count, Length: 90, dtype: int64

In [None]:
# Encode plate_region as integer ids: the 60 most frequent values before SPLIT_DATE_TRAIN get 1-60, all others share 61.
df_final, plate_region2id = make_mapping(df_final, "plate_region", 60, SPLIT_DATE_TRAIN)

In [None]:
# Persist the plate_region -> id mapping; the file name carries the GEO_SPLIT value.
pd.DataFrame(plate_region2id.items(), columns=["plate_region", "plate_region_id"]).to_csv(f"../data/plate_region2id_{GEO_SPLIT}.csv", index=False)

In [None]:
# Save the feature table with the encoded plate_region column; the file name carries the GEO_SPLIT value.
df_final.to_csv(f"../data/with_features_{GEO_SPLIT}.csv", index=False)