# <b>Text Data ETL</b>

In [1]:
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import sqlite3

## <b>Downloading Necessary Resources</b>

In [2]:
# Fetch the NLTK data packages this notebook relies on:
#   - 'punkt': tokenizer models, needed by word_tokenize
#   - 'stopwords': stop-word lists, read via stopwords.words('english')
# NOTE(review): recent NLTK releases may additionally require 'punkt_tab'
# for word_tokenize — confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\NAZMUL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\NAZMUL\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

## <b>Extract Data</b>

In [3]:
# Function to extract text from files
def extract_text_from_files(folder_path):
    """Read every ``.txt`` file located directly inside ``folder_path``.

    The folder is not searched recursively. Files are read in sorted
    file-name order so repeated runs yield the texts in the same order.

    Args:
        folder_path: Path of the directory to scan.

    Returns:
        A list with the full contents of each ``.txt`` file, decoded as UTF-8.
        The list is empty when the folder contains no matching files.

    Raises:
        OSError: If ``folder_path`` cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    text_data = []
    # os.listdir gives entries in arbitrary order; sort for reproducible output.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith('.txt'):
            continue
        file_path = os.path.join(folder_path, file_name)
        # A sub-directory whose name ends in '.txt' cannot be opened as a file.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            text_data.append(file.read())
    return text_data

## <b>Transform Data</b>

In [4]:
# Function to preprocess the text
def preprocess_text(text):
    """Tokenize ``text`` and keep only lower-cased alphabetic non-stop-words.

    Args:
        text: The raw text to clean.

    Returns:
        A list of tokens that were produced by ``word_tokenize``, lower-cased,
        consist solely of alphabetic characters, and are not in NLTK's
        English stop-word list.
    """
    raw_tokens = word_tokenize(text)
    english_stop_words = set(stopwords.words('english'))
    # Single pass: lower-case each token, then drop punctuation/non-alphabetic
    # tokens and stop-words.
    return [
        lowered
        for lowered in (token.lower() for token in raw_tokens)
        if lowered.isalpha() and lowered not in english_stop_words
    ]

# Function that transforms the texts
def transform_texts(text_data):
    """Apply ``preprocess_text`` to each text and collect the results.

    Args:
        text_data: Iterable of raw text strings.

    Returns:
        A list holding the ``preprocess_text`` result for each input text,
        in input order.
    """
    cleaned = []
    for document in text_data:
        cleaned.append(preprocess_text(document))
    return cleaned

    

## <b>Load Data</b>

In [5]:
def create_connection(db_file):
    """Open a SQLite connection to ``db_file`` and return it.

    Args:
        db_file: Database path passed straight to ``sqlite3.connect``.

    Returns:
        The ``sqlite3.Connection`` object; the caller is responsible for
        closing it.
    """
    return sqlite3.connect(db_file)

def create_table(conn):
    """Create the ``cleaned_texts`` table on ``conn`` if it does not exist.

    The table has an integer primary key ``id`` and a non-null ``text``
    column. Any ``sqlite3.Error`` raised while creating it is printed
    rather than propagated.

    Args:
        conn: An open ``sqlite3`` connection.
    """
    ddl = """
    CREATE TABLE IF NOT EXISTS cleaned_texts (
        id INTEGER PRIMARY KEY,
        text TEXT NOT NULL
    );
    """
    try:
        conn.cursor().execute(ddl)
    except sqlite3.Error as err:
        # Best-effort: report the problem and let the caller continue.
        print(err)

def save_cleaned_texts_to_db(cleaned_texts, db_file):
    """Insert each cleaned text into the ``cleaned_texts`` table of ``db_file``.

    Every token list is joined with single spaces and stored as one row.
    Rows are appended on every call; existing rows are left untouched. All
    inserts run in a single transaction, and the connection is always closed
    before returning.

    Args:
        cleaned_texts: Iterable of token sequences (each an iterable of str).
        db_file: Path of the SQLite database file.

    Raises:
        sqlite3.Error: If inserting the rows fails; the transaction is rolled
            back in that case.
    """
    conn = create_connection(db_file)
    try:
        create_table(conn)
        sql = ''' INSERT INTO cleaned_texts(text) VALUES(?) '''
        rows = ((' '.join(tokens),) for tokens in cleaned_texts)
        # `with conn` commits on success and rolls back on error, but it does
        # NOT close the connection — hence the surrounding try/finally.
        with conn:
            conn.executemany(sql, rows)
    finally:
        conn.close()

## <b>Main ETL Function</b>

In [6]:
# Main ETL function
def main_etl(folder_path, db_file):
    """Run the extract → transform → load pipeline end to end.

    Args:
        folder_path: Directory passed to ``extract_text_from_files``.
        db_file: SQLite database path passed to ``save_cleaned_texts_to_db``.

    Prints a confirmation message naming ``db_file`` once loading finishes.
    """
    documents = extract_text_from_files(folder_path)   # Extract
    token_lists = transform_texts(documents)           # Transform
    save_cleaned_texts_to_db(token_lists, db_file)     # Load
    print(f"Cleaned texts saved to {db_file}")

# <b>Example Use</b>

In [7]:
# Example usage: run the pipeline on the .txt files in the 'Text_Data' folder
# and store the cleaned texts in the SQLite file 'cleaned_text.db'.
# Both paths are relative, so they resolve against the current working directory.
folder_path = 'Text_Data'
db_file = 'cleaned_text.db'
main_etl(folder_path, db_file)

Cleaned texts saved to cleaned_text.db
