In [1]:
!pip install scikit-learn


Defaulting to user installation because normal site-packages is not writeable


In [27]:
def preprocess() -> dict:
   
    # prefix = '/mnt/d/CS-KUBEFLOW'
    prefix = 'gs://data-cs-kubeflow'
    Path(f"./G4").mkdir(parents=True, exist_ok=True)

    all_data = pd.read_parquet(f"./all_data.parquet")
    ## Input/Outputs in the parquet files:  
    # GR is the natural radioactivity of the rock
    # •RHOB is the density of the rock
    # •DTC is the compressionnal slowness through the rock
    # •NEUT is the neutron porosity and is related to the porosity of the rock
    # •PE is the photoelectric factor
    # •DS_INDEX is the depth where the data were measured
    # •DS_REF_ID is the name of the well
    # The output is :
    # •DT_SHEAR is Shear Slowness
    ## 
    input_col =  ["GR","RHOB","DTC","NEUT",'PE','DS_INDEX','ds_ref_id']
    output_col = "DT_SHEAR"

    # Subsetting the columns
    print("Subsetting the columns")
    all_data = all_data[input_col + [output_col]]

    # Create your own pipeline that will pre-process the data (explore,clean,normalize
    # the data,feature engineering,.. )

    # # Clean the data
    print("Drop rows where target is NaN")
    all_data.dropna(subset=output_col, inplace=True)

    # Split the data into train and test
    print("Split the data into train and test")
    train_data = all_data.sample(frac=0.8,random_state=0)
    test_data = all_data.drop(train_data.index)

    # print first 5 rows of train_data
    # Look for the cell that contains L04_06 and print the row
    print(train_data[train_data['ds_ref_id'] == 'L04_06'])

    ## METHOD 1:
    # fill missing values with mean column values
    # train_data.fillna(train_data.mean(), inplace=True)
    # test_data.fillna(test_data.mean(), inplace=True)

    # ## METHOD 2:
    # # Impute other columns using KNNImputer
    # # Create an instance of the KNNImputer class with k = 3

    print("Impute other columns using KNNImputer")
    imputer = KNNImputer(n_neighbors=3)
    # Impute missing values
    # take into account that one of the columns ds_ref_id is a string
    # so we need to convert it to a number
    # we will use the label encoder
    # create the Labelencoder object
    le = preprocessing.LabelEncoder()
    # convert the categorical columns into numeric
    train_data['ds_ref_id'] = le.fit_transform(train_data['ds_ref_id'])
    test_data['ds_ref_id'] = le.fit_transform(test_data['ds_ref_id'])
    # convert the dataframes to numpy arrays
    train_data = train_data.to_numpy()
    test_data = test_data.to_numpy()
    # impute the missing values
    train_data = imputer.fit_transform(train_data)
    test_data = imputer.fit_transform(test_data)
    
    
    # Convert numpy arrays back to dataframes
    train_data = pd.DataFrame(train_data, columns=input_col + [output_col])
    test_data = pd.DataFrame(test_data, columns=input_col + [output_col])
    # Normalize the data  
    # train_data = (train_data - train_data.mean()) / train_data.std()
    # test_data = (test_data - test_data.mean()) / test_data.std()
    print("Normalize the data")
    train_data = preprocessing.normalize(train_data)
    test_data = preprocessing.normalize(test_data)


    # -------- Debug ----------
    # Count the total number of rows
    total_rows = len(train_data)
    print("total_rows: ", total_rows)
    # Count the number of missing values per column
    missing_values = train_data.isnull().sum()
    print("missing_values: ", missing_values)

    # Save the results to a text file
    with open('{prefix}/G4/debug_train.txt', 'w') as f:
        f.write('Total number of rows: {}\n\n'.format(total_rows))
        f.write('Number of rows with missing values per column:\n')
        for col, count in missing_values.items():
            if count > 0:
                f.write('{}: {}\n'.format(col, count))
    #------------------------

    # Save the train and test data into parquet files
    print("Save the train and test data into parquet files")
    train_data.to_parquet(f"{prefix}/G4/train_data.parquet")
    test_data.to_parquet(f"{prefix}/G4/test_data.parquet")
    print("train_data.parquet and test_data.parquet saved in G4 folder")
    # return json string of train and test data addresses
    print("return json string of train and test data addresses")
    clean_data = json.dumps({
        'train_data': f"{prefix}/G4/train_data.parquet",
        'test_data': f"{prefix}/G4/test_data.parquet"
    })
    return clean_data 


In [28]:
preprocess()

Subsetting the columns
Drop rows where target is NaN
Split the data into train and test
                 GR     RHOB         DTC     NEUT  PE      DS_INDEX ds_ref_id   
6013870   48.610691      NaN   76.310257      NaN NaN   8382.943359    L04_06  \
6003370   22.376289      NaN   81.948196      NaN NaN   4937.893066    L04_06   
5995768         NaN      NaN  149.035553      NaN NaN   2443.677002    L04_06   
6019590   29.182110      NaN   66.286377      NaN NaN  10259.674805    L04_06   
6014001   38.757870      NaN   73.614014      NaN NaN   8425.923828    L04_06   
...             ...      ...         ...      ...  ..           ...       ...   
6022752  122.880577  2.64892   73.994202  0.19834 NaN  11297.126953    L04_06   
6013931   46.167110      NaN   79.991791      NaN NaN   8402.957031    L04_06   
6011729   29.268539      NaN   84.985474      NaN NaN   7680.480957    L04_06   
6011292   29.561569      NaN   70.002472      NaN NaN   7537.101562    L04_06   
6000667   37.897282  

In [4]:
clean_data = preprocess_null(args=None)

Drop rows where target is NaN
Split the data into train and test
Save the train and test data into parquet files
train_data.parquet and test_data.parquet saved in G5 folder
return json string of train and test data addresses


In [19]:
#read the data
prefix = '.'
train_data = pa.read_parquet(f"{prefix}/G5/train_data.parquet")

# count nan values
print("Count the number of missing values per column")
missing_values = train_data.isnull().sum()
print("missing_values: ", missing_values)


Count the number of missing values per column
missing_values:  GR           0
RHOB         0
DENC         0
DTC          0
DT_SHEAR     0
PE           0
BS           0
TH           0
NEUT         0
URAN         0
CGR          0
POTA         0
CALI         0
DEEPRES      0
SHALRES      0
DS_INDEX     0
ds_ref_id    0
dtype: int64


In [5]:
# import rf model
from linear_regression.linear_regression import linear_regression

# train the model
model = linear_regression(clean_data=clean_data, score=0.0)

The Score of Linear Regression is -5.14279096641701e-07


NameError: name 'args' is not defined

In [23]:
# import rf model
from lasso_regression.lasso_regression import lasso_regression as model

# train the model
score = model(clean_data=clean_data, score=0.0)

  model = cd_fast.enet_coordinate_descent(


The Score of Lasso Regression is 0.04037727687645043


NameError: name 'args' is not defined