In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
import nltk
from nltk.tokenize import word_tokenize

In [None]:
# word_tokenize relies on NLTK's Punkt tokenizer data. Older NLTK releases load
# it from the 'punkt' package, while NLTK >= 3.8.2 looks for 'punkt_tab'
# instead, so request both to keep the notebook runnable on either version.
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
def load_dataset(csv_path):
    """Load a CSV file into a pandas DataFrame.

    Args:
        csv_path: Location of the CSV file; handed unchanged to ``pandas.read_csv``.

    Returns:
        pandas.DataFrame holding the parsed contents of the file.
    """
    mapping_frame = pd.read_csv(csv_path)
    return mapping_frame

In [None]:
def preprocess_data(df):
    """Turn the mapping table into training features and labels.

    A new ``CountVectorizer`` is fitted on the 'COBOL' column. The fitted
    instance is returned alongside the data so the same vocabulary can be
    applied to keywords at prediction time.

    Args:
        df: DataFrame with a 'COBOL' column (input text) and a 'Java' column
            (target labels).

    Returns:
        Tuple ``(X, y, vectorizer)``: the result of ``fit_transform`` on the
        'COBOL' column, the 'Java' column, and the fitted vectorizer.
    """
    cobol_vectorizer = CountVectorizer()
    return cobol_vectorizer.fit_transform(df['COBOL']), df['Java'], cobol_vectorizer

In [None]:
def train_model(X, y):
    """Fit a random forest classifier on the given features and labels.

    The forest uses 100 trees and a fixed ``random_state`` of 42 so repeated
    runs on the same data build the same model.

    Args:
        X: Feature matrix passed to ``fit``.
        y: Target labels passed to ``fit``.

    Returns:
        The fitted ``RandomForestClassifier`` instance.
    """
    hyperparameters = {'n_estimators': 100, 'random_state': 42}
    classifier = RandomForestClassifier(**hyperparameters)
    classifier.fit(X, y)
    return classifier

In [None]:
def predict_java_keyword(model, vectorizer, cobol_keyword):
    """Predict the Java counterpart of a single COBOL keyword.

    Args:
        model: Object exposing ``predict``; called with the vectorized keyword.
        vectorizer: Object exposing ``transform``; called with a one-element
            list containing ``cobol_keyword``.
        cobol_keyword: The COBOL keyword to translate.

    Returns:
        The first element of the model's prediction.
    """
    # The keyword is wrapped in a list because transform is given a batch;
    # index 0 unwraps the single prediction again.
    return model.predict(vectorizer.transform([cobol_keyword]))[0]

In [None]:
def tokenize_and_predict(model, vectorizer, statement, df):
    """Translate a COBOL statement into a list of Java tokens.

    The statement is split with ``word_tokenize``. At each position the
    multi-word comparison phrases are tried first, longest phrase first, so
    'IS GREATER THAN AND EQUAL TO' becomes '>=' rather than '>' followed by
    the leftover words. Any other token that appears (case-insensitively) in
    ``df['COBOL']`` is translated with ``predict_java_keyword``; everything
    else is passed through unchanged.

    Args:
        model: Fitted classifier forwarded to ``predict_java_keyword``.
        vectorizer: Fitted vectorizer forwarded to ``predict_java_keyword``.
        statement: The COBOL statement to translate.
        df: DataFrame whose 'COBOL' column lists the known COBOL keywords.

    Returns:
        List of translated tokens in statement order.
    """
    multi_word_keywords = {
        'IS GREATER THAN': '>',
        'IS LESSER THAN': '<',
        'IS EQUAL TO': '==',
        'IS GREATER THAN AND EQUAL TO': '>=',
        'IS LESSER THAN AND EQUAL TO': '<='
    }
    # Window sizes come from the phrases themselves and are tried longest
    # first; otherwise the three-word prefix of a six-word phrase would win.
    phrase_lengths = sorted(
        {len(phrase.split()) for phrase in multi_word_keywords}, reverse=True
    )

    # Loop-invariant: upper-case the known keywords once instead of per token.
    known_cobol_keywords = set(df['COBOL'].str.upper())

    tokens = word_tokenize(statement)

    predicted_keywords = []

    i = 0
    while i < len(tokens):
        matched_length = 0
        for length in phrase_lengths:
            # Skip windows that would run past the end of the statement.
            if i + length > len(tokens):
                continue
            phrase = ' '.join(tokens[i:i + length]).upper()
            if phrase in multi_word_keywords:
                predicted_keywords.append(multi_word_keywords[phrase])
                matched_length = length
                break

        if matched_length:
            i += matched_length
            continue

        token_upper = tokens[i].upper()
        if token_upper in known_cobol_keywords:
            predicted_keyword = predict_java_keyword(model, vectorizer, token_upper)
        else:
            predicted_keyword = tokens[i]

        predicted_keywords.append(predicted_keyword)
        i += 1

    return predicted_keywords

In [None]:
def format_java_code(predicted_keywords):
    """Assemble translated tokens into a single Java-like code string.

    Tokens are handled in order:

    * block openers ('if', 'class', 'int', ...) are emitted at the current
      indent and increase the indent level;
    * 'endif' / 'endperform' / 'endread' / 'endwrite' decrease the indent
      level (never below zero) and emit '}';
    * '{' is appended to the previous fragment, or emitted on its own when
      there is no previous fragment, followed by a newline plus indent;
    * a token wrapped in double quotes is remembered as pending print text;
    * 'System.out.println' emits a newline, the indent, the call prefix and
      any pending print text;
    * 'THEN' is dropped; an empty token adds an empty fragment;
    * any other token is emitted with a trailing space, or with ')' when it
      is the last token or is directly followed by '{'.

    Args:
        predicted_keywords: Sequence of string tokens to format.

    Returns:
        The fragments concatenated without a separator.
    """
    block_openers = ('if', 'else', 'while', 'for', 'class', 'public', 'private',
                     'protected', 'static', 'void', 'int', 'float', 'double', 'String')
    block_closers = ('endif', 'endperform', 'endread', 'endwrite')
    indent_spaces = '  '

    java_code_lines = []
    indent_level = 0
    display_text = ''

    for i, keyword in enumerate(predicted_keywords):
        indent = indent_spaces * indent_level

        if keyword in block_openers:
            java_code_lines.append(indent + keyword)
            indent_level += 1
        elif keyword in block_closers:
            # Clamp at zero: an unmatched closer must not push the level
            # negative, which would under-indent every later block.
            indent_level = max(indent_level - 1, 0)
            java_code_lines.append(indent_spaces * indent_level + '}')
        elif keyword == '{':
            if java_code_lines:
                java_code_lines[-1] += ' {'
            else:
                # A brace arriving first has no fragment to attach to.
                java_code_lines.append('{')
            java_code_lines.append('\n' + indent)
        elif keyword == '':
            java_code_lines.append('')
        elif keyword == 'System.out.println':
            java_code_lines.append('\n' + indent + 'System.out.println(' + display_text)
            display_text = ''
        elif keyword.startswith('"') and keyword.endswith('"'):
            # Hold the quoted text until the next print statement is emitted.
            display_text = keyword
        elif keyword != 'THEN':
            has_next = i + 1 < len(predicted_keywords)
            if has_next and predicted_keywords[i + 1] != '{':
                suffix = ' '
            else:
                # Last token, or the one right before '{': close the expression.
                suffix = ')'
            java_code_lines.append(indent + keyword + suffix)

    return ''.join(java_code_lines)

In [None]:
def main(csv_path='/content/cobol_to_java_mapping_java.csv'):
    """Train the keyword model and run the interactive translation prompt.

    Loads the mapping CSV, fits the vectorizer and classifier, then reads
    statements from standard input until the user types 'exit'. Statements
    containing lowercase characters are rejected with an error message; any
    other statement is tokenized, translated and printed as formatted Java.

    This cell only defines the function. It is invoked by the notebook's
    ``if __name__ == "__main__": main()`` cell.

    Args:
        csv_path: Path of the COBOL-to-Java mapping CSV. Defaults to the
            location used when the notebook was written.
    """
    df = load_dataset(csv_path)

    X, y, vectorizer = preprocess_data(df)

    model = train_model(X, y)

    while True:
        statement = input("Enter a statement to tokenize (or type 'exit' to quit): ")

        # Checked before the uppercase rule so that a lowercase 'exit' is accepted.
        if statement.lower() == 'exit':
            print("Exiting the program.")
            break

        if any(char.islower() for char in statement):
            print("Error: Statement must contain only uppercase characters.")
            continue

        predicted_keywords = tokenize_and_predict(model, vectorizer, statement, df)

        java_code = format_java_code(predicted_keywords)

        print("Formatted Java Code:\n", java_code)

Formatted Java Code:
 
System.out.println(`` HELLO '')


In [None]:
# Script-style entry point: calls main() when the notebook/module runs as __main__.
# NOTE(review): `main` must already be defined by an earlier cell when this cell
# runs; otherwise the call below raises NameError.
if __name__ == "__main__":
    main()