In [1]:
!git clone https://github.com/amnashahid31/AI-project-dataset.git

Cloning into 'AI-project-dataset'...
remote: Enumerating objects: 89484, done.[K
remote: Total 89484 (delta 0), reused 0 (delta 0), pack-reused 89484[K
Receiving objects: 100% (89484/89484), 171.64 MiB | 19.56 MiB/s, done.
Resolving deltas: 100% (5/5), done.
Updating files: 100% (92579/92579), done.


In [2]:
dataset_folder_path = "/content/AI-project-dataset/stories"

In [3]:
#installations
!pip install sentence_transformers scikit-learn

Collecting sentence_transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence_transform

In [4]:
#imports
import pandas as pd
import os
from tqdm.autonotebook import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

  from tqdm.autonotebook import tqdm


In [5]:
# Create a list to store all file directories
all_files = []

# Walk through all directories and files
for root, _, filenames in os.walk(dataset_folder_path):
    for filename in filenames:
        # Get the full file path
        full_path = os.path.join(root, filename)
        # Append the full path to the list
        all_files.append(full_path)

# Display the total number of files found
print("Total files found:", len(all_files))

# Display the file paths to verify
#all_files

Total files found: 92579


In [6]:
# creating a master dataframe

def create_master_dataframe(file_dirs):
    """
    This function takes a list of file directories and creates a master dataframe with the content of each file.
    """
    master_df = pd.DataFrame(columns=['News'])  # Create an empty dataframe to store the text content

    # Loop through each file in the list of directories
    for file_path in tqdm(file_dirs, desc="Creating Master DataFrame"):
        try:
            # Read the file content into a temporary dataframe
            temp_df = pd.read_csv(
                file_path,
                sep='delimiter',  # The 'delimiter' is used to handle files with non-standard delimiters
                header=None,  # If there's no header, set this to None
                engine='python'  # Using Python engine for flexibility in parsing
            )

            # Rename the column to 'text'
            temp_df.columns = ['News']

            # Append the content to the master dataframe
            master_df = pd.concat([master_df, temp_df], ignore_index=True)

        except Exception as e:
            # If there's an error reading the file, print a warning and continue
            print(f"Error reading file {file_path}: {e}")



    return master_df

first_10000_files = all_files[:10]
master_df = create_master_dataframe(first_10000_files)
master_df.shape
master_df


Creating Master DataFrame:   0%|          | 0/10 [00:00<?, ?it/s]

Unnamed: 0,News
0,(CNN) -- Anna Tsuchiya is one of Japan's most ...
1,"Model, mother, actress and singer: Tsuchiya is..."
2,Born in Tokyo in 1984 to a Japanese mother and...
3,Taking on non-traditional female roles is some...
4,"""It's fun to act that kind of character who is..."
...,...
315,Typhoon Haiyan hit the Philippines in November...
316,@highlight
317,CNN's Andrew Stevens returns to the city almos...
318,@highlight


In [7]:
# Data Cleaning

master_df['News'] = master_df['News'].str.strip()  # Remove leading/trailing whitespace
master_df = master_df.dropna()  # Remove rows with missing data
master_df = master_df.loc[~(master_df['News']=='@highlight')] # Remove rows with highlight word only

master_df.shape

(286, 1)

In [9]:
# Load pre-trained BERT model
model = SentenceTransformer('bert-base-nli-mean-tokens')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [10]:
# Load pre-trained BERT model
# Sample sentences for each category
categories = {
    'Realistic': [
        'Realistic occupations often require physical strength and coordination.',
        'Jobs in this category may involve working with tools and machinery.',
        'Many realistic occupations involve working outdoors.',
        'People in realistic occupations may work with animals or plants.'
    ],
    'Investigative': [
        'Investigative jobs often require analytical thinking and problem-solving skills.',
        'Professions in this category may involve conducting research and experiments.',
        'People in investigative occupations often work with data and statistics.',
        'Investigative careers often involve uncovering hidden patterns or relationships.'
    ],
    'Artistic': [
        'Artistic professions allow for creative expression and imagination.',
        'Jobs in this category may involve working with various art forms such as painting, sculpting, or music.',
        'Artistic occupations often require originality and innovation.',
        'People in artistic careers may work independently or as part of a creative team.'
    ],
    'Social': [
        'Social occupations involve interacting with people on a regular basis.',
        'Jobs in this category may include counseling, teaching, or social work.',
        'Social careers often require strong communication and interpersonal skills.',
        'People in social occupations may work to support and empower others.'
    ],
    'Enterprising': [
        'Enterprising careers involve taking risks and pursuing new opportunities.',
        'Jobs in this category may include entrepreneurship, sales, or management.',
        'Enterprising occupations often require leadership and decision-making skills.',
        'People in enterprising professions may thrive in competitive environments.'
    ],
    'Conventional': [
        'Conventional jobs involve following established procedures and protocols.',
        'Professions in this category may include administrative roles or data entry.',
        'Conventional careers often require attention to detail and organization.',
        'People in conventional occupations may prefer structured and predictable work environments.'
    ]
}


# Calculate BERT embeddings for sample sentences
category_embeddings = {category: model.encode([sentence])[0] for category, sentence in categories.items()}

# Initialize an empty list to store predicted categories
predicted_categories = []

# Calculate cosine similarity between each sentence in the dataframe and sample sentences
for index, row in master_df.iterrows():
    sentence = row['News']  # Assuming the column name containing sentences is 'News'
    sentence_embedding = model.encode([sentence])[0]
    similarities = {}
    for category, category_embedding in category_embeddings.items():
        similarities[category] = cosine_similarity([sentence_embedding], [category_embedding])[0][0]
    predicted_category = max(similarities, key=similarities.get)
    predicted_categories.append(predicted_category)

# Add the list of predicted categories as a new column 'Predicted Category' to master_df
master_df['Category'] = predicted_categories

# Print the updated DataFrame
print(master_df)

model = SentenceTransformer('bert-base-nli-mean-tokens')

                                                  News       Category
0    (CNN) -- Anna Tsuchiya is one of Japan's most ...   Enterprising
1    Model, mother, actress and singer: Tsuchiya is...         Social
2    Born in Tokyo in 1984 to a Japanese mother and...   Enterprising
3    Taking on non-traditional female roles is some...  Investigative
4    "It's fun to act that kind of character who is...  Investigative
..                                                 ...            ...
312  His family is divided about whether they shoul...         Social
313  IN PICTURES: Tacloban after Typhoon Haiyan and...   Enterprising
315  Typhoon Haiyan hit the Philippines in November...   Enterprising
317  CNN's Andrew Stevens returns to the city almos...         Social
319  Landscape littered with wrecked cars, torn-dow...  Investigative

[286 rows x 2 columns]


In [11]:
master_df['Category'].value_counts()

Category
Enterprising     140
Investigative     98
Social            20
Conventional      17
Realistic          7
Artistic           4
Name: count, dtype: int64

In [12]:
from itertools import permutations
l = ['realistic','investigative','artistic','social','enterprising','conventional']
print('Total number of combination possible: {}'.format(len(list(permutations(l,3)))))

Total number of combination possible: 120


In [13]:
from itertools import permutations

def combine_sentences_with_categories(df):
    combined_data = []

    # Iterate through the DataFrame in groups of three sentences
    for i in range(0, len(df), 3):
        group = df.iloc[i:i+3]  # Get a group of three sentences

        # Combine the sentences and categories for the group
        combined_sentence = ' '.join(group['News'].values)
        combined_categories = ','.join(group['Category'].values)

        # Append the combined data to the list
        combined_data.append({'Combined Sentence': combined_sentence, 'Combined Categories': combined_categories})

    # Create a new DataFrame from the combined data
    combined_df = pd.DataFrame(combined_data)
    return combined_df

# Call the function with the labeled DataFrame master_df
combined_df = combine_sentences_with_categories(master_df)

# Print the new DataFrame
print(combined_df)


                                    Combined Sentence  \
0   (CNN) -- Anna Tsuchiya is one of Japan's most ...   
1   Taking on non-traditional female roles is some...   
2   More recently, Anna been taking to the stage, ...   
3   "There is an image of women by which someone d...   
4   "Death is a sad thing but it has to happen and...   
..                                                ...   
91  It is the resilience of the people here that s...   
92  However, the rebuilding stage needs to be fast...   
93  Toto was angry at the pace of the rebuilding b...   
94  IN PICTURES: Tacloban after Typhoon Haiyan and...   
95  Landscape littered with wrecked cars, torn-dow...   

                         Combined Categories  
0           Enterprising,Social,Enterprising  
1   Investigative,Investigative,Enterprising  
2          Enterprising,Investigative,Social  
3     Enterprising,Enterprising,Enterprising  
4   Investigative,Investigative,Enterprising  
..                               

In [14]:
# Display the DataFrame as a table
from IPython.display import display
display(combined_df)

Unnamed: 0,Combined Sentence,Combined Categories
0,(CNN) -- Anna Tsuchiya is one of Japan's most ...,"Enterprising,Social,Enterprising"
1,Taking on non-traditional female roles is some...,"Investigative,Investigative,Enterprising"
2,"More recently, Anna been taking to the stage, ...","Enterprising,Investigative,Social"
3,"""There is an image of women by which someone d...","Enterprising,Enterprising,Enterprising"
4,"""Death is a sad thing but it has to happen and...","Investigative,Investigative,Enterprising"
...,...,...
91,It is the resilience of the people here that s...,"Enterprising,Social,Social"
92,"However, the rebuilding stage needs to be fast...","Realistic,Enterprising,Enterprising"
93,Toto was angry at the pace of the rebuilding b...,"Enterprising,Enterprising,Social"
94,IN PICTURES: Tacloban after Typhoon Haiyan and...,"Enterprising,Enterprising,Social"


In [15]:
# Split the 'Category Permutations' column by comma and expand it into separate columns
categories = combined_df['Combined Categories'].str.split(',', expand=True)

# Iterate over unique categories and create binary columns
for category in categories.stack().unique():
    combined_df[category] = categories.apply(lambda row: 1 if category in row.values else 0, axis=1)

# Drop the original 'Category Permutations' column
combined_df = combined_df.drop(columns=['Combined Categories'])

# Display the updated DataFrame
print(combined_df)


                                    Combined Sentence  Enterprising  Social  \
0   (CNN) -- Anna Tsuchiya is one of Japan's most ...             1       1   
1   Taking on non-traditional female roles is some...             1       0   
2   More recently, Anna been taking to the stage, ...             1       1   
3   "There is an image of women by which someone d...             1       0   
4   "Death is a sad thing but it has to happen and...             1       0   
..                                                ...           ...     ...   
91  It is the resilience of the people here that s...             1       1   
92  However, the rebuilding stage needs to be fast...             1       0   
93  Toto was angry at the pace of the rebuilding b...             1       1   
94  IN PICTURES: Tacloban after Typhoon Haiyan and...             1       1   
95  Landscape littered with wrecked cars, torn-dow...             0       0   

    Investigative  Artistic  Conventional  Realisti

In [16]:
# Display the DataFrame as a table
from IPython.display import display
display(combined_df)

Unnamed: 0,Combined Sentence,Enterprising,Social,Investigative,Artistic,Conventional,Realistic
0,(CNN) -- Anna Tsuchiya is one of Japan's most ...,1,1,0,0,0,0
1,Taking on non-traditional female roles is some...,1,0,1,0,0,0
2,"More recently, Anna been taking to the stage, ...",1,1,1,0,0,0
3,"""There is an image of women by which someone d...",1,0,0,0,0,0
4,"""Death is a sad thing but it has to happen and...",1,0,1,0,0,0
...,...,...,...,...,...,...,...
91,It is the resilience of the people here that s...,1,1,0,0,0,0
92,"However, the rebuilding stage needs to be fast...",1,0,0,0,0,1
93,Toto was angry at the pace of the rebuilding b...,1,1,0,0,0,0
94,IN PICTURES: Tacloban after Typhoon Haiyan and...,1,1,0,0,0,0


In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch


In [18]:
from transformers import BertTokenizer

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to find the maximum length of tokenized sequences
def find_max_length(df):
    max_length = 0

    # Iterate over each combined sentence
    for index, row in df.iterrows():
        # Tokenize the combined sentence
        tokens = tokenizer.encode(row['Combined Sentence'], add_special_tokens=True)
        # Update max_length if the length of tokens exceeds current max_length
        max_length = max(max_length, len(tokens))

    return max_length

# Find the maximum length
MAX_LEN = find_max_length(combined_df)
print("Maximum length of tokenized sequences:", MAX_LEN)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Maximum length of tokenized sequences: 264
