In [95]:
import pandas as pd
import psycopg2
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline


sellers_df = pd.read_csv("data/olist_sellers_dataset.csv")

sellers_df.head()

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state
0,3442f8959a84dea7ee197c632cb2df15,13023,campinas,SP
1,d1b65fc7debc3361ea86b5f14c68d2e2,13844,mogi guacu,SP
2,ce3ad9de960102d0677a81f5d0bb7b2d,20031,rio de janeiro,RJ
3,c0f3eea2e14555b6faeea3dd58c1b1c3,4195,sao paulo,SP
4,51a04a8a6bdcb23deccc82b0b80742cf,12914,braganca paulista,SP


## Check the total number of rows with unique seller id

In [96]:
#Total unique rows
len(sellers_df["seller_id"].unique())


3095

## Check the total number of totals in given the dataframe

In [97]:
#Check the length of the dataframe
len(sellers_df)


3095

## check for the duplicates rows

In [98]:
#Check for duplicates rows
len(sellers_df.drop_duplicates())

3095

## Get information on the dataframe heading - includes datatype

In [99]:
#information about the dataframe 

sellers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3095 entries, 0 to 3094
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   seller_id               3095 non-null   object
 1   seller_zip_code_prefix  3095 non-null   int64 
 2   seller_city             3095 non-null   object
 3   seller_state            3095 non-null   object
dtypes: int64(1), object(3)
memory usage: 96.8+ KB


## Total count of rows and columns

In [100]:
# Get the count of total rows and columns
sellers_df.shape

(3095, 4)

## Check on the sum of empty cells in the dataframe columns

In [101]:
# Calculate the sum of empty cells in each column
print("\nEmpty cells count in DataFrame:")
print(sellers_df.isnull().sum())



Empty cells count in DataFrame:
seller_id                 0
seller_zip_code_prefix    0
seller_city               0
seller_state              0
dtype: int64


## Matching of Full State Name based on state abbreviation column 

In [102]:
# Assuming you have a dictionary mapping two-letter state abbreviations to full names
state_mapping = {
'SP':'Sao Paulo',
'RN':'Rio Grande do Norte',
'AC':'Acre',
'RJ':'Rio de Janeiro',
'ES':'Espirito Santo',
'MG':'Minas Gerais',
'BA':'Bahia',
'SE':'Sergipe',
'PE':'Pernambuco',
'AL':'Alagoas',
'PB':'Paraiba',
'CE':'Ceara',
'PI':'Piaui',
'MA':'Maranhao',
'PA':'Para',
'AP':'Amapa',
'AM':'Amazonas',
'RR':'Roraima',
'DF':'Distrito Federal',
'GO':'Goias',
'RO':'Rondonia',
'TO':'Tocantins',
'MT':'Mato Grosso',
'MS':'Mato Grosso do Sul',
'RS':'Rio Grande do Sul',
'PR':'Parana',
'SC':'Santa Catarina'}

# Assuming your DataFrame is 'sellers_df' and the state abbreviation column is 'seller_state'
sellers_df['seller_state_fullname'] = sellers_df['seller_state'].map(state_mapping)

sellers_df.head()

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state,seller_state_fullname
0,3442f8959a84dea7ee197c632cb2df15,13023,campinas,SP,Sao Paulo
1,d1b65fc7debc3361ea86b5f14c68d2e2,13844,mogi guacu,SP,Sao Paulo
2,ce3ad9de960102d0677a81f5d0bb7b2d,20031,rio de janeiro,RJ,Rio de Janeiro
3,c0f3eea2e14555b6faeea3dd58c1b1c3,4195,sao paulo,SP,Sao Paulo
4,51a04a8a6bdcb23deccc82b0b80742cf,12914,braganca paulista,SP,Sao Paulo


## Fill the zip code with a leading zero if the length of the zip code is 4 digits

In [103]:

# Add leading zero to 4-digit zip codes
sellers_df['seller_zip_code_prefix'] = sellers_df['seller_zip_code_prefix'].astype(str).str.zfill(5)


# Display the modified DataFrame
print(sellers_df.head())


                          seller_id seller_zip_code_prefix        seller_city  \
0  3442f8959a84dea7ee197c632cb2df15                  13023           campinas   
1  d1b65fc7debc3361ea86b5f14c68d2e2                  13844         mogi guacu   
2  ce3ad9de960102d0677a81f5d0bb7b2d                  20031     rio de janeiro   
3  c0f3eea2e14555b6faeea3dd58c1b1c3                  04195          sao paulo   
4  51a04a8a6bdcb23deccc82b0b80742cf                  12914  braganca paulista   

  seller_state seller_state_fullname  
0           SP             Sao Paulo  
1           SP             Sao Paulo  
2           RJ        Rio de Janeiro  
3           SP             Sao Paulo  
4           SP             Sao Paulo  


## Checking of spelling in seller_city in sellers_df dataframe

In [105]:

# Read the city spelling check file
matching_word_df = pd.read_csv("data/city_spelling_check.csv")


# Sample dataset of misspelled words and their corrections
X_train = matching_word_df['wrong_word']
y_train = matching_word_df['right_word']

# Feature extraction and model training pipeline
pipeline = Pipeline([
    ('vect', CountVectorizer(analyzer='char', ngram_range=(1, 3))),
    ('clf', DecisionTreeClassifier())
])

# Train the model
pipeline.fit(X_train, y_train)

# Example usage
input_words = sellers_df['seller_city']
suggestedword = []
for word in input_words:
    suggestion = pipeline.predict([word])[0]
    suggestedword.append(suggestion) # Append the string suggestion, not the list suggestedword

# Add sugested word to the dataframe
sellers_df['seller_city'] = suggestedword



# Print or use the updated dataframe matching_word_found
print(sellers_df.head())




                          seller_id seller_zip_code_prefix        seller_city  \
0  3442f8959a84dea7ee197c632cb2df15                  13023           Campinas   
1  d1b65fc7debc3361ea86b5f14c68d2e2                  13844         Mogi Guacu   
2  ce3ad9de960102d0677a81f5d0bb7b2d                  20031     Rio de Janeiro   
3  c0f3eea2e14555b6faeea3dd58c1b1c3                  04195          Sao Paulo   
4  51a04a8a6bdcb23deccc82b0b80742cf                  12914  Braganca Paulista   

  seller_state seller_state_fullname  
0           SP             Sao Paulo  
1           SP             Sao Paulo  
2           RJ        Rio de Janeiro  
3           SP             Sao Paulo  
4           SP             Sao Paulo  


## Connection to postgresql

In [107]:
 #connection to postgresql 

conn =None
cur=None


try:
    conn = psycopg2.connect(
    host='localhost',
    dbname='Database(new)',
    user='postgres',
    password='admin',
    port='5432',
     )
    cur=conn.cursor()
 
    create_script_review ='''CREATE TABLE IF NOT EXISTS olist_sellers_dataset(
        seller_id text primary key,
        seller_zip_code_prefix int,
        seller_city varchar(250),
        seller_state varchar (10),
        seller_state_fullname varchar(250),
        FOREIGN KEY (seller_id) REFERENCES olist_order_items_dataset(seller_id),
        FOREIGN KEY (seller_zip_code_prefix) REFERENCES olist_geolocation_dataset(geolocation_zip_code_prefix)


 )'''


    cur.execute(create_script_review)

    conn.commit()
    for index, row in sellers_df.iterrows():
        insert_script_host = '''INSERT INTO olist_sellers_dataset  (  
        seller_id ,
       seller_zip_code_prefix ,
       seller_city ,
       seller_state,
       seller_state_fullname) VALUES (%s,%s,%s,%s,%s) '''
        values =(row.iloc[0],row.iloc[1],row.iloc[2],row.iloc[3],row.iloc[4])
        cur.execute(insert_script_host,values)
        conn.commit()



except Exception as error:      
    print(error)


finally:

    if cur is not None:

        cur.close()
        conn.close()

duplicate key value violates unique constraint "olist_sellers_dataset_pkey"
DETAIL:  Key (seller_id)=(3442f8959a84dea7ee197c632cb2df15) already exists.



Remarks:

-  Sellers are from brazil
- No duplicate rows found, each row has a unique seller id
- Total seller - 3095

Date types of each column:

-     Column                  Non-Null Count  Dtype 
- ---  ------                  --------------  ----- 
-  0   seller_id               3095 non-null   object
-  1   seller_zip_code_prefix  3095 non-null   int64 
 - 2   seller_city             3095 non-null   object
 - 3   seller_state            3095 non-null   object

- No null values found

Transformation

- Matching of Full State Name based on state abbreviation column 
- Fill the zip code with a leading zero if the length of the zip code is 4 digits
- Checking of spelling in seller_city in sellers_df dataframe
